This notebook reports the analysis of snvs mutational signatures at constitutive origins.

1 Origin mutational signatures

1.1 Origin trinucleotide composition

Mutational signature are corrected for trinucleotide composition extracted from 1 kb origin domains (center +- 500bp) or origin flanks (+- 10 kb excluding origins).

In this chunk, we extract the trinucleotide counts in a 1bp sliding windows.

# r
library(rtracklayer)
library(BSgenome.Hsapiens.UCSC.hg19)

# At origins
# Load domain/bin coordinates
ori.domain.bed <- read.table("./Dataset/ori/BED/ori.1kb.hg19.bed")
colnames(ori.domain.bed) <- c("seqnames", "start", "end")
ori.domain.gr <- makeGRangesFromDataFrame(ori.domain.bed, keep.extra.columns = T)
my.chr <- c(1:22, "X", "Y")
ori.domain.gr <- ori.domain.gr[seqnames(ori.domain.gr) %in% my.chr]
seqlevelsStyle(ori.domain.gr) <- "UCSC"
# Recover sequences and compute trinucleotide statistics
ori.views <- Views(Hsapiens, ori.domain.gr)
ori.tri <- trinucleotideFrequency(ori.views, step=1, as.prob = FALSE)
ori.tri.df <- t(ori.tri)
colnames(ori.tri.df) <- ori.domain.gr$ori.id
write.csv(ori.tri.df, "./Dataset/ori/ori.tri.summary.csv", quote = F, row.names = T)

# At flanks
# Load domain/bin coordinates
flanks.domain.bed <- read.table("./Dataset/ori/BED/ori.flanks.hg19.bed")
colnames(flanks.domain.bed) <- c("seqnames", "start", "end")
flanks.domain.gr <- makeGRangesFromDataFrame(flanks.domain.bed, keep.extra.columns = T)
my.chr <- c(1:22, "X", "Y")
flanks.domain.gr <- flanks.domain.gr[seqnames(flanks.domain.gr) %in% my.chr]
seqlevelsStyle(flanks.domain.gr) <- "UCSC"
# Recover sequences and compute trinucleotide statistics
flanks.views <- Views(Hsapiens, flanks.domain.gr)
flanks.tri <- trinucleotideFrequency(flanks.views, step=1, as.prob = FALSE)
flanks.tri.df <- t(flanks.tri)
colnames(flanks.tri.df) <- flanks.domain.gr$flanks.id
write.csv(flanks.tri.df, "./Dataset/ori/flanks.tri.summary.csv", quote = F, row.names = T)

1.2 Mutational signatures analysis

1.2.1 Mutational signatures extraction

Mutational analysis is performed separately when considering snvs overlaping with origins (+- 1kb) or origin flanks (+- 10 kb excluding origins).

Prepare bed files.

# r
library(dplyr)

# Open manifest
SSM.manifest.df <- read.table(gzfile("./Dataset/ICGC/SSM/manifest.1698683356752.tar.gz"), skip = 1)
SSM.manifest.df <- SSM.manifest.df %>% dplyr::select(V9, V10, V2, V3, V5)
colnames(SSM.manifest.df) <- c("donor.id", "project", "file.id", "object.id", "file.name")
snvs.manifest.df <- SSM.manifest.df %>% filter(str_detect(file.name, 'somatic.snv')) %>% separate(project, c("cancer.type", "project"), sep = "-")
# 1,950 entries, 1,830 donors

# At origins
# Open ori coordinates
ori.1kb.domains.gr <- import("./Dataset/ori/BED/ori.1kb.hg19.bed")
# List vcf files
vcf.files <- list.files("./Dataset/ICGC/SSM/VCF/") # 3,892 files
vcf.files <- vcf.files[!grepl(".idx", vcf.files)]  # 1,946 snvs vcf files
# Format and save vcf files
snvs.summary.df <- tibble()
for (i in 1:length(vcf.files)) {
print(i)
vcf.files.i <- vcf.files[i]
print(vcf.files.i)
df.i <- snvs.manifest.df %>% filter(file.name == vcf.files.i)
donor.i <- df.i$donor.id
path.i <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.i, sep = "")
bed.i <- read.table(gzfile(path.i), comment.char = "#") %>% select(V1, V2, V4, V5) %>% mutate(end = V2 + 1) %>% select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
gr.i <- makeGRangesFromDataFrame(bed.i, keep.extra.columns = T)
gr.overlap.i <- subsetByOverlaps(gr.i, ori.1kb.domains.gr)
vcf.i <- as.data.frame(gr.overlap.i) %>%
    mutate(ID = paste("rs", start, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(chr = seqnames, pos = start, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT)
colnames(vcf.i) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcf.path.i <- paste0("./002_Mut_signatures_Pan_cancer/BED/",donor.i,".bed")
write.table(vcf.i, vcf.path.i, sep="\t", col.names = T, row.names = F, quote = F)
snvs.count.i <- nrow(vcf.i)
snvs.summary.i <- cbind.data.frame(donor.id = donor.i, snvs.count = snvs.count.i)
snvs.summary.df <- rbind(snvs.summary.df, snvs.summary.i)
}
saveRDS(snvs.summary.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.summary.df.rds")

# At the flanks
# Open flanks coordinates
ori.flanks.gr <- import("./Dataset/ori/BED/ori.flanks.hg19.bed")
# Format and save vcf files
snvs.flanks.summary.df <- tibble()
for (i in 1:length(vcf.files)) {
print(i)
vcf.files.i <- vcf.files[i]
print(vcf.files.i)
df.i <- snvs.manifest.df %>% filter(file.name == vcf.files.i)
donor.i <- df.i$donor.id
path.i <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.i, sep = "")
bed.i <- read.table(gzfile(path.i), comment.char = "#") %>% select(V1, V2, V4, V5) %>% mutate(end = V2 + 1) %>% select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
gr.i <- makeGRangesFromDataFrame(bed.i, keep.extra.columns = T)
gr.overlap.i <- subsetByOverlaps(gr.i, ori.flanks.gr)
vcf.i <- as.data.frame(gr.overlap.i) %>%
    mutate(ID = paste("rs", start, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(chr = seqnames, pos = start, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT)
colnames(vcf.i) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcf.path.i <- paste0("./002_Mut_signatures_Pan_cancer/BEDflanks/",donor.i,".bed")
write.table(vcf.i, vcf.path.i, sep="\t", col.names = T, row.names = F, quote = F)
snvs.count.i <- nrow(vcf.i)
snvs.summary.i <- cbind.data.frame(donor.id = donor.i, snvs.count = snvs.count.i)
snvs.flanks.summary.df <- rbind(snvs.flanks.summary.df, snvs.summary.i)
}
saveRDS(snvs.flanks.summary.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.summary.df.rds")

Convert to vcf format.

# bash
# Add header to all bed files

# snvs at origins
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/BED/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/BED/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/VCF/$file.vcf
done

# snvs at flanks
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/BEDflanks/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/BEDflanks/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/VCFflanks/$file.vcf
done

For the rest of the analysis, only vcf files reporting 50 snvs at origins or more are considered

Generate mutation count matrices

# r
library(MutationalPatterns)
library(BSgenome.Hsapiens.UCSC.hg19)

# Load snvs count data
# at origins
snvs.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.summary.df.rds")
snvs.summary.filter.df <- snvs.summary.df %>% filter(snvs.count >= 50) # 229 donors
# at the flanks
snvs.flanks.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.summary.df.rds")
snvs.flanks.summary.df <- snvs.flanks.summary.df %>% rename(snvs.count = "snvs.flanks.count") %>% left_join(snvs.summary.df, by = "donor.id")
# Look at correlation
plot(log10(snvs.flanks.summary.df$snvs.flanks.count), log10(snvs.flanks.summary.df$snvs.count))

# Load genome
ref.genome <- BSgenome.Hsapiens.UCSC.hg19

# Prepare files to analyse
snvs.vcf.files <- list.files("./002_Mut_signatures_Pan_cancer/VCF/") # 1,826 files
sample.names <- snvs.vcf.files %>% str_replace(".vcf", "")
sample.names.filter <- sample.names[sample.names %in% snvs.summary.filter.df$donor.id] # 227 donors
write.table(sample.names.filter, "./002_Mut_signatures_Pan_cancer/selected.donor.list.tsv", sep="\t", col.names = F, row.names = F, quote = F)
# Process snvs at origins
snvs.vcf.files.filter <- paste0(sample.names.filter, ".vcf")
snvs.vcf.path.filter <- paste0("./002_Mut_signatures_Pan_cancer/VCF/", snvs.vcf.files.filter)
# Load vcf in a gr list
snvs.ori.grl <- read_vcfs_as_granges(snvs.vcf.path.filter, sample.names.filter, ref.genome) # 227 vcf
# Save grange list
saveRDS(snvs.ori.grl, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
snvs.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
# Compute mutation matrices
snvs.ori.mut.mat <- mut_matrix(vcf_list = snvs.ori.grl, ref_genome = ref.genome)
# Save mutation matrices
saveRDS(snvs.ori.mut.mat, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.mat.rds")

# Process snvs at flanks
snvs.flanks.vcf.path.filter <- paste0("./002_Mut_signatures_Pan_cancer/VCFflanks/", snvs.vcf.files.filter)
# Load vcf in a gr list
snvs.flanks.ori.grl <- read_vcfs_as_granges(snvs.flanks.vcf.path.filter, sample.names.filter, ref.genome) # 227 vcf
# Save grange list
saveRDS(snvs.flanks.ori.grl, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")
snvs.flanks.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")
# Compute mutation matrices
snvs.flanks.ori.mut.mat <- mut_matrix(vcf_list = snvs.flanks.ori.grl, ref_genome = ref.genome)
# Save mutation matrices
saveRDS(snvs.flanks.ori.mut.mat, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.mut.mat.rds")

Plot representative mutation profiles

library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load mutational matrices
snvs.ori.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.mat.rds")
snvs.flanks.ori.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.mut.mat.rds")
# Plot representative profiles
# for snvs at origins
plot_96_profile(snvs.ori.mut.mat[, c(1, 50, 100, 200)], condensed = TRUE)

# for snvs at origin flanks
plot_96_profile(snvs.flanks.ori.mut.mat[, c(1, 50, 100, 200)], condensed = TRUE)

1.2.2 Mutational signatures correction

For comparing signatures at origins and at the flanks, the extracted signatures are corrected for trinucleotide composition

# r
library(tidyr)

############
# Open trinucleotide statistics
ori.tri.df <- read.csv("./Dataset/ori/ori.tri.summary.csv", row.names = 1)
flanks.tri.df <- read.csv("./Dataset/ori/flanks.tri.summary.csv", row.names = 1)
# Compute sums
trinuc.sum.df <- cbind.data.frame(tri = row.names(ori.tri.df), ori.tri = rowSums(ori.tri.df), flanks.tri = rowSums(flanks.tri.df))

###########
# Correct mutational matrices for trinucleotide composition

# for origins
snvs.ori.corr.mut.mat <- tibble(.rows = 96)
for (i in 1:ncol(snvs.ori.mut.mat)) {
print(i/ncol(snvs.ori.mut.mat))
profile.i <- as.data.frame(snvs.ori.mut.mat[,i])
colnames(profile.i) <- "count"
sample.i <- colnames(snvs.ori.mut.mat)[i]
profile.corrected.i <- profile.i %>% add_rownames(var = "Tri.mut") %>% tidyr::separate(Tri.mut, c("D", "R", "A", "U"), sep = "\\[|>|]", remove = F) %>% mutate(tri = paste0(D, R, U, sep = "")) %>% dplyr::select(-D, -R, -A, -U, -Tri.mut) %>% left_join(trinuc.sum.df, by = "tri") %>% mutate(corr.count = count/ori.tri, corr.count.norm = corr.count/sum(corr.count)) %>% select(corr.count.norm)
row.names(profile.corrected.i) <- row.names(profile.i)
colnames(profile.corrected.i) <- sample.i
snvs.ori.corr.mut.mat <- cbind.data.frame(snvs.ori.corr.mut.mat, profile.corrected.i)
}
snvs.ori.corr.mut.mat <- as.matrix(snvs.ori.corr.mut.mat)
saveRDS(snvs.ori.corr.mut.mat, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.corr.mut.mat.rds")

# for flanks
snvs.flanks.ori.corr.mut.mat <- tibble(.rows = 96)
for (i in 1:ncol(snvs.flanks.ori.mut.mat)) {
print(i/ncol(snvs.flanks.ori.mut.mat))
profile.i <- as.data.frame(snvs.flanks.ori.mut.mat[,i])
colnames(profile.i) <- "count"
sample.i <- colnames(snvs.flanks.ori.mut.mat)[i]
profile.corrected.i <- profile.i %>% add_rownames(var = "Tri.mut") %>% tidyr::separate(Tri.mut, c("D", "R", "A", "U"), sep = "\\[|>|]", remove = F) %>% mutate(tri = paste0(D, R, U, sep = "")) %>% dplyr::select(-D, -R, -A, -U, -Tri.mut) %>% left_join(trinuc.sum.df, by = "tri") %>% mutate(corr.count = count/flanks.tri, corr.count.norm = corr.count/sum(corr.count)) %>% select(corr.count.norm)
row.names(profile.corrected.i) <- row.names(profile.i)
colnames(profile.corrected.i) <- sample.i
snvs.flanks.ori.corr.mut.mat <- cbind.data.frame(snvs.flanks.ori.corr.mut.mat, profile.corrected.i)
}
snvs.flanks.ori.corr.mut.mat <- as.matrix(snvs.flanks.ori.corr.mut.mat)
saveRDS(snvs.flanks.ori.corr.mut.mat, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.corr.mut.mat.rds")

Plot representative correction of mutation profiles

library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load mutational matrices
snvs.ori.corr.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.corr.mut.mat.rds")
snvs.flanks.ori.corr.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.corr.mut.mat.rds")
# Plot representative profiles
i <- 160
plot_compare_profiles(snvs.ori.mut.mat[, i], snvs.ori.corr.mut.mat[, i], profile_names = c("Original", "Corrected"), condensed = TRUE)

plot_compare_profiles(snvs.flanks.ori.mut.mat[, i], snvs.flanks.ori.corr.mut.mat[, i], profile_names = c("Original", "Corrected"), condensed = TRUE)

The corrected signatures observed at the flanks are then substracted to the origin signatures

# r
# The profiles have already been normalised
# Check that the order of the columns in both matrices are identical
identical(colnames(snvs.ori.corr.mut.mat), colnames(snvs.flanks.ori.corr.mut.mat)) # --> TRUE
identical(colnames(snvs.ori.corr.mut.mat), sample(colnames(snvs.flanks.ori.corr.mut.mat))) # --> FALSE
# Substract
snvs.diff.mut.mat <- snvs.ori.corr.mut.mat - snvs.flanks.ori.corr.mut.mat
# Remove negative values and normalise values to 1
snvs.diff.mut.mat[snvs.diff.mut.mat<0] <- 0
snvs.diff.mut.mat <- as.data.frame(snvs.diff.mut.mat) %>% mutate_all(~ ./sum(.)) %>% as.matrix()
saveRDS(snvs.diff.mut.mat, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")

Plot representative differential mutation profiles

library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load mutational matrices
snvs.diff.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Plot representative profiles
plot_96_profile(snvs.diff.mut.mat[, c(1, 50, 100, 200)], condensed = TRUE)

2 Cancer genome clustering by mutational signatures

2.1 Clustering based on cosine similarities

Cancer genomes are clustered based on the mutational profiles at origins. To do so, we use the background adjusted mutational signatures (corrected for both trinucleotide composition and rates observed at flanking domains).

# r
library(tidyverse)
library(dendextend)
library(tibble)
library(gplots)
library(rlist)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load mutational differential matrix
snvs.diff.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Compute cosine similarity matrix
snvs.cos.sim.mat <- cos_sim_matrix(snvs.diff.mut.mat, snvs.diff.mut.mat)
# cluster samples based on euclidean distance between relative contribution
snvs.cos.sim.hc <- hclust(dist(snvs.cos.sim.mat), method = "complete")
# order samples according to clustering
row.order <- rownames(snvs.cos.sim.mat)[snvs.cos.sim.hc$order]
# Generate dendrogram
snvs.cos.sim.dend <- as.dendrogram(snvs.cos.sim.hc)
# Define and identify number of relevant clusters
# Visual inspection reveals 5 clusters
nleaves(snvs.cos.sim.dend) # 227
## [1] 227
nnodes(snvs.cos.sim.dend) # 453
## [1] 453
plot(color_branches(snvs.cos.sim.dend, k=5, groupLabels=T),leaflab="none")

# Visual inspection reveals 5 clusters
# Define and cut clusters
mut.sig.clusters <- cutree(snvs.cos.sim.dend, k=5, order_clusters_as_data = FALSE)
table(mut.sig.clusters)
## mut.sig.clusters
##  1  2  3  4  5 
## 42 42 29 23 91
# Generate df reporting the information about genomes and their cluster assignments
clusters.df <- data.frame(donor.id = names(mut.sig.clusters), cluster = mut.sig.clusters)
saveRDS(clusters.df, file = "./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Cluster columns
hc.sample <- snvs.cos.sim.mat %>% t() %>% dist() %>% hclust(method = "complete")
# Define column and row order
col.order <- colnames(snvs.cos.sim.mat)[hc.sample$order]
row.order <- rownames(snvs.cos.sim.mat)[hc.sample$order]
# Make matrix long and set factor levels, to get the correct order for plotting.
snvs.cos.sim.mat.plot <- snvs.cos.sim.mat %>% as.data.frame() %>% tibble::rownames_to_column("Sample") %>%
    pivot_longer(-Sample, names_to = "Signature", values_to = "Cosine.sim") %>%
    mutate(Signature = factor(Signature, levels = col.order), Sample = factor(Sample, levels = row.order))
# Plot heatmap
snvs.cos.sim.heatmap <- snvs.cos.sim.mat.plot %>% ggplot(aes(x = Signature, y = Sample, fill = Cosine.sim, order = Sample)) +
    geom_raster() +
    scale_fill_distiller(palette = "YlOrBr", direction = 1, name = "Cosine \nsimilarity", limits = c(0, 1.000000001)) +
    theme_bw() +
    theme(aspect.ratio=1, axis.ticks = element_blank(), axis.text.x=element_blank(), axis.text.y=element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + labs(x = NULL, y = NULL)
snvs.cos.sim.heatmap

pdf("./Rplots/snvs.cos.sim.dend.pdf", width=10, height=6, useDingbats=FALSE)
plot(color_branches(snvs.cos.sim.dend, k=5, groupLabels=T),leaflab="none")
dev.off()
## quartz_off_screen 
##                 2
pdf("./Rplots/snvs.cos.sim.heatmap.pdf", width=10, height=6, useDingbats=FALSE)
snvs.cos.sim.heatmap
dev.off()
## quartz_off_screen 
##                 2

WARNING: The heatmap is horizontally flipped in the final figure

2.2 Mutational cluster analysis

2.2.1 Primary sites

Assess primary sites distribution within each cluster

# r
library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cluster information
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
table(clusters.df$cluster)
## 
##  1  2  3  4  5 
## 42 42 29 23 91
# Open manifest
SSM.manifest.df <- read.table(gzfile("./Dataset/ICGC/SSM/manifest.1698683356752.tar.gz"), skip = 1)
SSM.manifest.df <- SSM.manifest.df %>% dplyr::select(V9, V10) %>% separate(V10, c("cancer.type", "project"), sep = "-")
colnames(SSM.manifest.df) <- c("donor.id", "cancer.type", "project")
# Load cancer project information
cancer.project.info.df <- read.csv("./Dataset/ICGC/projects_2023_10_30_04_53_06.tsv", sep = "\t") %>% separate(Project.Code, c("cancer.type", "project"), sep = "-") %>% dplyr::select(cancer.type, Primary.Site)
# Add cancer/primary site information to clusters
clusters.cancer.df <- clusters.df %>% left_join(SSM.manifest.df, by = "donor.id") %>% left_join(cancer.project.info.df, by = "cancer.type") %>% distinct() %>% mutate(cancer.info = paste(cancer.type, Primary.Site, sep = " | "))
saveRDS(clusters.cancer.df, file = "./002_Mut_signatures_Pan_cancer/rds/clusters.cancer.df.rds")
# Assess primary sites distribution within each cluster
cluster.1.cancer.count <- clusters.cancer.df %>% filter(cluster == 1) %>% group_by(cancer.info) %>% summarise(count=n())
cluster.2.cancer.count <- clusters.cancer.df %>% filter(cluster == 2) %>% group_by(cancer.info) %>% summarise(count=n())
cluster.3.cancer.count <- clusters.cancer.df %>% filter(cluster == 3) %>% group_by(cancer.info) %>% summarise(count=n())
cluster.4.cancer.count <- clusters.cancer.df %>% filter(cluster == 4) %>% group_by(cancer.info) %>% summarise(count=n())
cluster.5.cancer.count <- clusters.cancer.df %>% filter(cluster == 5) %>% group_by(cancer.info) %>% summarise(count=n())
# Prepare donut charts
# Color pallette is defined in 003_Transcriptomics_cluster_analysis
hsize <- 2
#
cluster.1.cancer.donut <- cluster.1.cancer.count %>% mutate(x = hsize) %>% 
  ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
  geom_col(color = "black", alpha = 0.6) +
  geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
  coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
  ggtitle("cluster 1") +
  scale_fill_manual(values = c("#F9B233")) +
  theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.1.cancer.donut

#
cluster.2.cancer.donut <- cluster.2.cancer.count %>% mutate(x = hsize) %>% 
  ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
  geom_col(color = "black", alpha = 0.6) +
  geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
  coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
  ggtitle("cluster 2") +
  scale_fill_manual(values = c("#A084BC", "#9D9D9C", "#BE1622", "#A1D1B1", "#F9B233", "#662483", "#006633", "#29235C")) +
  theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.2.cancer.donut

#
cluster.3.cancer.donut <- cluster.3.cancer.count %>% mutate(x = hsize) %>% 
  ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
  geom_col(color = "black", alpha = 0.6) +
  geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
  coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
  ggtitle("cluster 3") +
  scale_fill_manual(values = c("#9D9D9C", "#A1D1B1", "#EA83B3", "#1D71B8", "#E94E1B", "#662483", "#006633", "#29235C", "#DEDC00", "#B17F4A")) +
  theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.3.cancer.donut

#
cluster.4.cancer.donut <- cluster.4.cancer.count %>% mutate(x = hsize) %>% 
  ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
  geom_col(color = "black", alpha = 0.6) +
  geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
  coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
  ggtitle("cluster 4") +
  scale_fill_manual(values = c("#9B7277")) +
  theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.4.cancer.donut

#
cluster.5.cancer.donut <- cluster.5.cancer.count %>% mutate(x = hsize) %>% 
  ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
  geom_col(color = "black", alpha = 0.6) +
  geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
  coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
  ggtitle("cluster 5") +
  scale_fill_manual(values = c("#9D9D9C", "#683C11", "#A1D1B1", "#EA83B3", "#073F60", "#1D71B8", "#BE1622", "#F9B233", "#E94E1B", "#662483", "#006633", "#DEDC00", "#B17F4A")) +
    theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.5.cancer.donut

pdf("./Rplots/snvs.sig.cluster.donuts.pdf", width=10, height=6, useDingbats=FALSE)
ggarrange(cluster.1.cancer.donut, cluster.2.cancer.donut, cluster.3.cancer.donut, cluster.4.cancer.donut, cluster.5.cancer.donut, nrow = 2, ncol = 3)
dev.off()  
## quartz_off_screen 
##                 2

2.2.2 Aggregated mutational signatures

Compute aggregated mutational signatures for each clusters

# r
library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cluster information
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Load mutational matrix
snvs.diff.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Sum mutation counts per clusters
cluster.1.id <- (clusters.df %>% filter(cluster == 1))$donor.id
  snvs.ori.mut.cluster.1 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.1.id)) %>% mutate(cluster.1 = rowSums(.)) %>% dplyr::select(cluster.1)
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id
  snvs.ori.mut.cluster.2 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.id)) %>% mutate(cluster.2 = rowSums(.)) %>% dplyr::select(cluster.2)
cluster.3.id <- (clusters.df %>% filter(cluster == 3))$donor.id
  snvs.ori.mut.cluster.3 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.3.id)) %>% mutate(cluster.3 = rowSums(.)) %>% dplyr::select(cluster.3)
cluster.4.id <- (clusters.df %>% filter(cluster == 4))$donor.id
  snvs.ori.mut.cluster.4 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.4.id)) %>% mutate(cluster.4 = rowSums(.)) %>% dplyr::select(cluster.4)
cluster.5.id <- (clusters.df %>% filter(cluster == 5))$donor.id
  snvs.ori.mut.cluster.5 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.id)) %>% mutate(cluster.5 = rowSums(.)) %>% dplyr::select(cluster.5)
  
# Combine information
snvs.ori.mut.cluster <- cbind.data.frame(snvs.ori.mut.cluster.1, snvs.ori.mut.cluster.2, snvs.ori.mut.cluster.3,
                                         snvs.ori.mut.cluster.4, snvs.ori.mut.cluster.5)
snvs.ori.mut.cluster <- as.matrix(snvs.ori.mut.cluster)
saveRDS(snvs.ori.mut.cluster, "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")

# Plot profiles
plot_96_profile(snvs.ori.mut.cluster, condensed = TRUE, ymax = 0.24)

pdf("./Rplots/snvs.ori.mut.cluster.pdf", width=5, height=6, useDingbats=FALSE)
plot_96_profile(snvs.ori.mut.cluster, condensed = TRUE, ymax = 0.24)
dev.off() 
## quartz_off_screen 
##                 2

Assess the distribution of number of mutations within clusters

# r
library(dplyr)
library(MutationalPatterns)
library(scales)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cluster information
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Load snvs count summary
snvs.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.summary.df.rds")
# Combine
clusters.snvs.count.df <- clusters.df %>% left_join(snvs.summary.df, by = "donor.id")
# Plot
clusters.snvs.count.plot <- clusters.snvs.count.df %>% filter(snvs.count >= 50) %>% 
  ggplot(aes(x=as.character(cluster), y=snvs.count, fill="#E41F1A", alpha = 0.3)) +
  geom_boxplot(outlier.shape = NA, width = 0.5) +
  geom_jitter(color="black", size=0.4, alpha=0.4) + ylab("snvs count at origins") +
  scale_y_continuous(trans = log10_trans(), limits=c(50,1500)) +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
clusters.snvs.count.plot

pdf("./Rplots/clusters.snvs.count.plot.pdf", width=7, height=6, useDingbats=FALSE)
clusters.snvs.count.plot
dev.off()
## quartz_off_screen 
##                 2

Assess the distribution of the ratio of snvs at origins vs flanks for each cluster

# r
library(dplyr)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cluster information
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Load snvs density ratio df
snvs.dens.ratio.df <- readRDS("./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
colnames(snvs.dens.ratio.df)[1] <- "donor.id"
# Combine information
clusters.density.df <- clusters.df %>% left_join(snvs.dens.ratio.df, by = "donor.id")
# Plot
clusters.density.plot <- clusters.density.df %>% 
  ggplot(aes(x=as.character(cluster), y=snvs.dens.ratio, fill="#E41F1A", alpha = 0.3)) +
  geom_boxplot(outlier.shape = NA, width = 0.5) +
  geom_jitter(color="black", size=0.4, alpha=0.4) + ylab("Mutation density ratio (ori/flanks)") +
  geom_hline(yintercept=1, linetype="dashed") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
clusters.density.plot

pdf("./Rplots/clusters.density.plot.pdf", width=7, height=6, useDingbats=FALSE)
clusters.density.plot
dev.off()
## quartz_off_screen 
##                 2

2.2.3 Cluster 5 subclusters

Characterise cluster 5 subclusters

# R
library(tidyverse)
library(dendextend)
library(tibble)
library(gplots)
library(rlist)
library(MutationalPatterns)

setwd("~/Desktop/Projects/OriCan")

# Load mutational differential matrix
snvs.diff.mut.mat <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Compute cosine similarity matrix
snvs.cos.sim.mat <- cos_sim_matrix(snvs.diff.mut.mat, snvs.diff.mut.mat)
# cluster samples based on euclidean distance between relative contribution
snvs.cos.sim.hc <- hclust(dist(snvs.cos.sim.mat), method = "complete")
# order samples according to clustering
row.order <- rownames(snvs.cos.sim.mat)[snvs.cos.sim.hc$order]
# Generate dendrogram
snvs.cos.sim.dend <- as.dendrogram(snvs.cos.sim.hc)
# Plot original clusters
plot(color_branches(snvs.cos.sim.dend, k=5, groupLabels=T),leaflab="none")
# Split cluster 5
plot(color_branches(snvs.cos.sim.dend, k=11, groupLabels=T),leaflab="none")
# Define subclusters
# cluster 5-1, join cluster 7 and 8
# cluster 5-2, cluster 9
# cluster 5-3, cluster 10
# cluster 5-4, cluster 11
# Define and cut clusters
mut.sig.sub.clusters <- cutree(snvs.cos.sim.dend, k=11, order_clusters_as_data = FALSE)
table(mut.sig.sub.clusters)
# Generate df reporting the information about genomes and their cluster assignments
sub.clusters.df <- data.frame(donor.id = names(mut.sig.sub.clusters), cluster = mut.sig.sub.clusters)
# Define cluster 5 sub-clusters
cluster.5.1.id <- (sub.clusters.df %>% filter(cluster == "7" | cluster == "8"))$donor.id
cluster.5.2.id <- (sub.clusters.df %>% filter(cluster == "9"))$donor.id
cluster.5.3.id <- (sub.clusters.df %>% filter(cluster == "10"))$donor.id
cluster.5.4.id <- (sub.clusters.df %>% filter(cluster == "11"))$donor.id

# Compute mutational signatures for sub-clusters
cluster.5.1.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.1.id)) %>% mutate(cluster.5.1 = rowSums(.)) %>% dplyr::select(cluster.5.1) 
cluster.5.2.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.2.id)) %>% mutate(cluster.5.2 = rowSums(.)) %>% dplyr::select(cluster.5.2) 
cluster.5.3.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.3.id)) %>% mutate(cluster.5.3 = rowSums(.)) %>% dplyr::select(cluster.5.3) 
cluster.5.4.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.4.id)) %>% mutate(cluster.5.4 = rowSums(.)) %>% dplyr::select(cluster.5.4) 

# Combine signatures
cluster.5.sub.clusters.mut <- cbind.data.frame(cluster.5.1.mut, cluster.5.2.mut, cluster.5.3.mut, cluster.5.4.mut)

saveRDS(cluster.5.sub.clusters.mut, "~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/cluster.5.sub.clusters.mut.rds")

Plot signatures

# r
library(dplyr)
library(MutationalPatterns)

# Load cluster information
cluster.5.sub.clusters.mut <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/cluster.5.sub.clusters.mut.rds")

plot_96_profile(cluster.5.sub.clusters.mut, condensed = TRUE, ymax = 0.45)

pdf("~/Desktop/Projects/OriCan/Manuscript/Nature Communications/Rplots/cluster.5.sub.clusters.signatures.pdf", width=5, height=4, useDingbats=FALSE)
plot_96_profile(cluster.5.sub.clusters.mut, condensed = TRUE, ymax = 0.45)
dev.off() 
## quartz_off_screen 
##                 2

2.3 Origin-associated signature exposure

2.3.1 Compute mutation count matrices

Mutations from genomes of each cluster are combined and spatially resolved in bins of 100 nt around origins. The extracted signatures are then fitted to the mutation count matrices in order to characterise their spatial contribution.

Combine vcf files

# Cluster 1 - skin melanomas
# Recover donor id, file names and paths
cluster.1.donor.id <- (clusters.cancer.df %>% filter(cluster == 1))$donor.id
cluster.1.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.1.donor.id))$file.name # 42 genomes
cluster.1.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.1.vcf.name, sep = "")
# Combine vcf files
cluster.1.snvs.df <- tibble()
for (i in 1:length(cluster.1.vcf.path)) {
  print(i/length(cluster.1.vcf.path))
  vcf.i <- read.table(gzfile(cluster.1.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
  cluster.1.snvs.df <- rbind(cluster.1.snvs.df, vcf.i)
}
write.table(cluster.1.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)

# Cluster 2 - diverse genomes
# Recover donor id, file names and paths
cluster.2.donor.id <- (clusters.cancer.df %>% filter(cluster == 2))$donor.id
cluster.2.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.2.donor.id))$file.name # 46 genomes
cluster.2.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.2.vcf.name, sep = "")
# Combine vcf files
cluster.2.snvs.df <- tibble()
for (i in 1:length(cluster.2.vcf.path)) {
  print(i/length(cluster.2.vcf.path))
  vcf.i <- read.table(gzfile(cluster.2.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
  cluster.2.snvs.df <- rbind(cluster.2.snvs.df, vcf.i)
}
write.table(cluster.2.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)

# Cluster 3 - diverse genomes
# Recover donor id, file names and paths
cluster.3.donor.id <- (clusters.cancer.df %>% filter(cluster == 3))$donor.id
cluster.3.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.3.donor.id))$file.name # 29 genomes
cluster.3.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.3.vcf.name, sep = "")
# Combine vcf files
cluster.3.snvs.df <- tibble()
for (i in 1:length(cluster.3.vcf.path)) {
  print(i/length(cluster.3.vcf.path))
  vcf.i <- read.table(gzfile(cluster.3.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
  cluster.3.snvs.df <- rbind(cluster.3.snvs.df, vcf.i)
}
write.table(cluster.3.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)

# Cluster 4 - chronic myeloid disorders
# Recover donor id, file names and paths
cluster.4.donor.id <- (clusters.cancer.df %>% filter(cluster == 4))$donor.id
cluster.4.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.4.donor.id))$file.name # 23 genomes
cluster.4.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.4.vcf.name, sep = "")
# Combine vcf files
cluster.4.snvs.df <- tibble()
for (i in 1:length(cluster.4.vcf.path)) {
  print(i/length(cluster.4.vcf.path))
  vcf.i <- read.table(gzfile(cluster.4.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
  cluster.4.snvs.df <- rbind(cluster.4.snvs.df, vcf.i)
}
write.table(cluster.4.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)

# Cluster 5 - diverse genomes
# Recover donor id, file names and paths
cluster.5.donor.id <- (clusters.cancer.df %>% filter(cluster == 5))$donor.id
cluster.5.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.5.donor.id))$file.name # 93 genomes
cluster.5.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.5.vcf.name, sep = "")
# Combine vcf files
cluster.5.snvs.df <- tibble()
for (i in 1:length(cluster.5.vcf.path)) {
  print(i/length(cluster.5.vcf.path))
  vcf.i <- read.table(gzfile(cluster.5.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
  cluster.5.snvs.df <- rbind(cluster.5.snvs.df, vcf.i)
}
write.table(cluster.5.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)

Compute snvs-origin distances and filter mutations at less than 10kb of an origin

# bash/SLURM

# Cluster 1 - skin melanomas
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.bed \
  > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.sorted.bed \
  -b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
  > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.closest.bed \
  > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.closest.bed

# Cluster 2 - diverse genomes
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.closest.bed

# Cluster 3 - diverse genomes
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.closest.bed

# Cluster 4 - chronic myeloid disorders
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.closest.bed

# Cluster 5 - diverse genomes
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.closest.bed

Split bed files according to origin distances

# r

# Cluster 1 - skin melanomas
cluster.1.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.10kb.bed", sep = "\t")
cluster.1.snvs.10kb.df <- cluster.1.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.1.snvs.10kb.df$dist)) {
  cluster.1.vcf <- cluster.1.snvs.10kb.df %>% filter(dist == i) %>%
    mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
  colnames(cluster.1.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
  vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_1/Bin_",i,".bed")
  write.table(cluster.1.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}

# Cluster 2 - diverse genomes
cluster.2.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.10kb.bed", sep = "\t")
cluster.2.snvs.10kb.df <- cluster.2.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.2.snvs.10kb.df$dist)) {
  cluster.2.vcf <- cluster.2.snvs.10kb.df %>% filter(dist == i) %>%
    mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
  colnames(cluster.2.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
  vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_2/Bin_",i,".bed")
  write.table(cluster.2.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}

# Cluster 3 - diverse genomes
cluster.3.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.10kb.bed", sep = "\t")
cluster.3.snvs.10kb.df <- cluster.3.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.3.snvs.10kb.df$dist)) {
  cluster.3.vcf <- cluster.3.snvs.10kb.df %>% filter(dist == i) %>%
    mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
  colnames(cluster.3.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
  vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_3/Bin_",i,".bed")
  write.table(cluster.3.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}

# Cluster 4 - chronic myeloid disorders
cluster.4.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.10kb.bed", sep = "\t")
cluster.4.snvs.10kb.df <- cluster.4.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.4.snvs.10kb.df$dist)) {
  cluster.4.vcf <- cluster.4.snvs.10kb.df %>% filter(dist == i) %>%
    mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
  colnames(cluster.4.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
  vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_4/Bin_",i,".bed")
  write.table(cluster.4.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}

# Cluster 5 - diverse genomes
cluster.5.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.10kb.bed", sep = "\t")
cluster.5.snvs.10kb.df <- cluster.5.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.5.snvs.10kb.df$dist)) {
  cluster.5.vcf <- cluster.5.snvs.10kb.df %>% filter(dist == i) %>%
    mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
  colnames(cluster.5.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
  vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_5/Bin_",i,".bed")
  write.table(cluster.5.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}

Convert to vcf format

# bash/SLURM

# Cluster 1 - skin melanomas
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_1/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_1/$file.bed >  /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_1/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_1

# Cluster 2 - diverse genomes
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_2/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_2/$file.bed >  /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_2/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_2

# Cluster 3 - diverse genomes
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_3/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_3/$file.bed >  /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_3/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_3

# Cluster 4 - chronic myeloid disorders
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_4/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_4/$file.bed >  /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_4/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_4

# Cluster 5 - diverse genomes
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_5/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_5/$file.bed >  /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_5/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_5

Extract mutational count matrices

# r
library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load genome
ref.genome <- BSgenome.Hsapiens.UCSC.hg19

# Cluster 1 - skin melanomas
vcfFiles.cluster.1 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_1/"), value = TRUE)
cluster.1.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_1/", vcfFiles.cluster.1)
cluster.1.sample.names <- vcfFiles.cluster.1 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.1.snvs.grl <- read_vcfs_as_granges(cluster.1.files, cluster.1.sample.names, ref.genome)
saveRDS(cluster.1.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.snvs.grl.rds")

# Cluster 2 - diverse genomes
vcfFiles.cluster.2 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_2/"), value = TRUE)
cluster.2.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_2/", vcfFiles.cluster.2)
cluster.2.sample.names <- vcfFiles.cluster.2 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.2.snvs.grl <- read_vcfs_as_granges(cluster.2.files, cluster.2.sample.names, ref.genome)
saveRDS(cluster.2.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.snvs.grl.rds")

# Cluster 3 - diverse genomes
vcfFiles.cluster.3 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_3/"), value = TRUE)
cluster.3.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_3/", vcfFiles.cluster.3)
cluster.3.sample.names <- vcfFiles.cluster.3 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.3.snvs.grl <- read_vcfs_as_granges(cluster.3.files, cluster.3.sample.names, ref.genome)
saveRDS(cluster.3.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.snvs.grl.rds")

# Cluster 4 - chronic myeloid disorders
vcfFiles.cluster.4 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_4/"), value = TRUE)
cluster.4.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_4/", vcfFiles.cluster.4)
cluster.4.sample.names <- vcfFiles.cluster.4 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.4.snvs.grl <- read_vcfs_as_granges(cluster.4.files, cluster.4.sample.names, ref.genome)
saveRDS(cluster.4.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.snvs.grl.rds")

# Cluster 5 - diverse genomes
vcfFiles.cluster.5 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_5/"), value = TRUE)
cluster.5.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_5/", vcfFiles.cluster.5)
cluster.5.sample.names <- vcfFiles.cluster.5 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.5.snvs.grl <- read_vcfs_as_granges(cluster.5.files, cluster.5.sample.names, ref.genome)
saveRDS(cluster.5.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.snvs.grl.rds")

# Load grange lists
cluster.1.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.1.snvs.grl.rds")
cluster.2.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.2.snvs.grl.rds")
cluster.3.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.3.snvs.grl.rds")
cluster.4.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.4.snvs.grl.rds")
cluster.5.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.5.snvs.grl.rds")

# Compute mutation count matrices
cluster.1.mut.mat <- mut_matrix(vcf_list = cluster.1.snvs.grl, ref_genome = ref.genome)
cluster.2.mut.mat <- mut_matrix(vcf_list = cluster.2.snvs.grl, ref_genome = ref.genome)
cluster.3.mut.mat <- mut_matrix(vcf_list = cluster.3.snvs.grl, ref_genome = ref.genome)
cluster.4.mut.mat <- mut_matrix(vcf_list = cluster.4.snvs.grl, ref_genome = ref.genome)
cluster.5.mut.mat <- mut_matrix(vcf_list = cluster.5.snvs.grl, ref_genome = ref.genome)

# Save
saveRDS(cluster.1.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.mut.mat.rds")
saveRDS(cluster.2.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.mut.mat.rds")
saveRDS(cluster.3.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.mut.mat.rds")
saveRDS(cluster.4.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.mut.mat.rds")
saveRDS(cluster.5.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.mut.mat.rds")

2.3.2 Local exposure

# r

# Load mutation count matrices
cluster.1.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.mut.mat.rds")
cluster.2.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.mut.mat.rds")
cluster.3.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.mut.mat.rds")
cluster.4.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.mut.mat.rds")
cluster.5.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.mut.mat.rds")

# Load identified signatures

snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")

# Cluster 1 - skin melanomas
cluster.1.fit <- fit_to_signatures(cluster.1.mut.mat, snvs.ori.mut.cluster)
cluster.1.fit.df <- as.data.frame(cluster.1.fit$contribution)
cluster.1.fit.df <- as.data.frame(t(cluster.1.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.1 = cluster.1) %>% mutate(SBS.clust.1.FC = SBS.clust.1/mean(SBS.clust.1[c(1:10,190:200)]))
cluster.1.fit.df$dist <- as.numeric(cluster.1.fit.df$dist)
cluster.1.norm.fit.df <- cluster.1.fit.df  %>% dplyr::select(dist, value = SBS.clust.1.FC) %>% mutate(cluster = "cluster.1")

# Cluster 2 - diverse genomes
cluster.2.fit <- fit_to_signatures(cluster.2.mut.mat, snvs.ori.mut.cluster)
cluster.2.fit.df <- as.data.frame(cluster.2.fit$contribution)
cluster.2.fit.df <- as.data.frame(t(cluster.2.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.2 = cluster.2) %>% mutate(SBS.clust.2.FC = (SBS.clust.2+1)/mean(SBS.clust.2[c(1:10,190:200)]))
cluster.2.fit.df$dist <- as.numeric(cluster.2.fit.df$dist)
cluster.2.norm.fit.df <- cluster.2.fit.df  %>% dplyr::select(dist, value = SBS.clust.2.FC) %>% mutate(cluster = "cluster.2")

# Cluster 3 - diverse genomes
cluster.3.fit <- fit_to_signatures(cluster.3.mut.mat, snvs.ori.mut.cluster)
cluster.3.fit.df <- as.data.frame(cluster.3.fit$contribution)
cluster.3.fit.df <- as.data.frame(t(cluster.3.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.3 = cluster.3) %>% mutate(SBS.clust.3.FC = SBS.clust.3/mean(SBS.clust.3[c(1:10,190:200)]))
cluster.3.fit.df$dist <- as.numeric(cluster.3.fit.df$dist)
cluster.3.norm.fit.df <- cluster.3.fit.df  %>% dplyr::select(dist, value = SBS.clust.3.FC) %>% mutate(cluster = "cluster.3")

# Cluster 4 - chronic myeloid disorders
cluster.4.fit <- fit_to_signatures(cluster.4.mut.mat, snvs.ori.mut.cluster)
cluster.4.fit.df <- as.data.frame(cluster.4.fit$contribution)
cluster.4.fit.df <- as.data.frame(t(cluster.4.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.4 = cluster.4) %>% mutate(SBS.clust.4.FC = (SBS.clust.4+1)/mean(SBS.clust.4[c(1:10,190:200)]))
cluster.4.fit.df$dist <- as.numeric(cluster.4.fit.df$dist)
cluster.4.norm.fit.df <- cluster.4.fit.df  %>% dplyr::select(dist, value = SBS.clust.4.FC) %>% mutate(cluster = "cluster.4")

# Cluster 5 - diverse genomes
cluster.5.fit <- fit_to_signatures(cluster.5.mut.mat, snvs.ori.mut.cluster)
cluster.5.fit.df <- as.data.frame(cluster.5.fit$contribution)
cluster.5.fit.df <- as.data.frame(t(cluster.5.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.5 = cluster.5) %>% mutate(SBS.clust.5.FC = SBS.clust.5/mean(SBS.clust.5[c(1:10,190:200)]))
cluster.5.fit.df$dist <- as.numeric(cluster.5.fit.df$dist)
cluster.5.norm.fit.df <- cluster.5.fit.df  %>% dplyr::select(dist, value = SBS.clust.5.FC) %>% mutate(cluster = "cluster.5")

# Combine results and plot

cluster.summary.norm.fit.df <- rbind(cluster.1.norm.fit.df, cluster.2.norm.fit.df, cluster.3.norm.fit.df, cluster.4.norm.fit.df, cluster.5.norm.fit.df)
saveRDS(cluster.summary.norm.fit.df, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.summary.norm.fit.df.rds")
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load results
cluster.summary.norm.fit.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.summary.norm.fit.df.rds")

# Plot

cluster.summary.norm.fit.plot.1 <- cluster.summary.norm.fit.df %>% filter(cluster == "cluster.1" | cluster == "cluster.3" | cluster == "cluster.5") %>%
  ggplot(aes(x = dist, y = value, color = cluster)) +
  geom_line() + xlim(-5000,5000) + ylim(0.5, 1.5) +
  scale_color_manual(values = c("#0173B4", "#069F73", "#56B3E6")) +
  geom_hline(yintercept=1, linetype="dashed") +
  xlab("Distance from origins (bp)") + ylab("Signature contribution\n(fold change from background)") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.summary.norm.fit.plot.1

cluster.summary.norm.fit.plot.2 <- cluster.summary.norm.fit.df %>% filter(cluster == "cluster.2" | cluster == "cluster.4") %>%
  ggplot(aes(x = dist, y = value, color = cluster)) +
  geom_line() + xlim(-5000,5000) + ylim(0, 10) +
  scale_color_manual(values = c("#D56014", "#E6A004")) +
  geom_hline(yintercept=1, linetype="dashed") +
  xlab("Distance from origins (bp)") + ylab("Signature contribution\n(fold change from background)") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.summary.norm.fit.plot.2

pdf("./Rplots/cluster.summary.norm.fit.plot.pdf", width=10, height=6, useDingbats=FALSE)
ggarrange(cluster.summary.norm.fit.plot.2, cluster.summary.norm.fit.plot.1, nrow = 1, ncol = 2)
dev.off() 
## quartz_off_screen 
##                 2

2.3.3 Origin footprint

In order to appreciate the spatial exposure of the identified mutational processes, we overlap their signal with the footprint of origins.

# r

# Prepare origin coordinates
ori.inter.dist.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed", sep="\t")
colnames(ori.inter.dist.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")

# Rescale to 10kb origin domains
ori.rescale.ir <- IRanges(start = round(10000-(ori.inter.dist.df$ori.width/2)), width = ori.inter.dist.df$ori.width)
# Add fill range for baseline
baseline <- IRanges(start=0, width = 20000)
ori.rescale.ir <- c(ori.rescale.ir, baseline)
# Compute coverage and prepare dataframe
ori.cov <- coverage(ori.rescale.ir)
ori.cov.df <- cbind.data.frame(dist = seq(-9999,9999,1), ori.cov = as.vector(ori.cov)) %>% mutate(ori.dens = ori.cov/max(ori.cov))
  
ori.cov.plot <- ori.cov.df %>% ggplot(aes(x=dist, y=ori.dens)) +
  geom_area(fill="#999999", alpha=0.4) +
  geom_line(color="black", size=0.5) + xlim(-5000, 5000) +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
ori.cov.plot
  
pdf("./Rplots/ori.cov.plot.pdf", width=7, height=6, useDingbats=FALSE)
ori.cov.plot
dev.off()

2.4 Genome-wide mutation signatures of identified clusters

Compute aggregated mutational signatures for each cluster

As a control, the genome-side signatures from genomes of each cluster is extracted.

Prepare snsvs gr lists per cluster.

# r
library(dplyr)
library(stringr)
library(BSgenome.Hsapiens.UCSC.hg19)
# Open manifest
SSM.manifest.df <- read.table(gzfile("./Dataset/ICGC/SSM/manifest.1698683356752.tar.gz"), skip = 1)
SSM.manifest.df <- SSM.manifest.df %>% dplyr::select(V9, V10, V2, V3, V5)
colnames(SSM.manifest.df) <- c("donor.id", "project", "file.id", "object.id", "file.name")
snvs.manifest.df <- SSM.manifest.df %>% filter(str_detect(file.name, 'somatic.snv')) %>% separate(project, c("cancer.type", "project"), sep = "-")
# 1,950 entries, 1,830 donors
# Recover genomes to analyse
cluster.1.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.1.id) # 42 genomes
cluster.2.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.2.id) # 46 genomes
cluster.3.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.3.id) # 29 genomes
cluster.4.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.4.id) # 23 genomes
cluster.5.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.5.id) # 93 genomes
# List vcf files
vcf.files <- list.files("./Dataset/ICGC/SSM/VCF/") # 3,892 files
vcf.files <- vcf.files[!grepl(".idx", vcf.files)]  # 1,946 snvs vcf files
# per cluster
vcf.files.cluster.1 <- vcf.files[vcf.files %in% cluster.1.vcf.df$file.name] # 42 files
vcf.files.cluster.2 <- vcf.files[vcf.files %in% cluster.2.vcf.df$file.name] # 46 files
vcf.files.cluster.3 <- vcf.files[vcf.files %in% cluster.3.vcf.df$file.name] # 29 files
vcf.files.cluster.4 <- vcf.files[vcf.files %in% cluster.4.vcf.df$file.name] # 23 files
vcf.files.cluster.5 <- vcf.files[vcf.files %in% cluster.5.vcf.df$file.name] # 93 files
# define path
vcf.path.cluster.1 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.1, sep = "")
vcf.path.cluster.2 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.2, sep = "")
vcf.path.cluster.3 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.3, sep = "")
vcf.path.cluster.4 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.4, sep = "")
vcf.path.cluster.5 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.5, sep = "")
# define path
vcf.name.cluster.1 <- cluster.1.vcf.df$donor.id
vcf.name.cluster.2 <- cluster.2.vcf.df$donor.id
vcf.name.cluster.3 <- cluster.3.vcf.df$donor.id
vcf.name.cluster.4 <- cluster.4.vcf.df$donor.id
vcf.name.cluster.5 <- cluster.5.vcf.df$donor.id
# Load vcf files in a gr lists
ref.genome <- BSgenome.Hsapiens.UCSC.hg19 
cluster.1.grl <- read_vcfs_as_granges(vcf.path.cluster.1, vcf.name.cluster.1, ref.genome) # xx vcf
cluster.2.grl <- read_vcfs_as_granges(vcf.path.cluster.2, vcf.name.cluster.2, ref.genome) # xx vcf
cluster.3.grl <- read_vcfs_as_granges(vcf.path.cluster.3, vcf.name.cluster.3, ref.genome) # xx vcf
cluster.4.grl <- read_vcfs_as_granges(vcf.path.cluster.4, vcf.name.cluster.4, ref.genome) # xx vcf
cluster.5.grl <- read_vcfs_as_granges(vcf.path.cluster.5, vcf.name.cluster.5, ref.genome) # xx vcf
# Save grange list
saveRDS(cluster.1.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.1.grl.rds")
saveRDS(cluster.2.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.grl.rds")
saveRDS(cluster.3.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.3.grl.rds")
saveRDS(cluster.4.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.4.grl.rds")
saveRDS(cluster.5.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.5.grl.rds")
# Compute mutation matrices
cluster.1.mut.mat <- mut_matrix(vcf_list = unlist(cluster.1.grl), ref_genome = ref.genome)
cluster.2.mut.mat <- mut_matrix(vcf_list = unlist(cluster.2.grl), ref_genome = ref.genome)
cluster.3.mut.mat <- mut_matrix(vcf_list = unlist(cluster.3.grl), ref_genome = ref.genome)
cluster.4.mut.mat <- mut_matrix(vcf_list = unlist(cluster.4.grl), ref_genome = ref.genome)
cluster.5.mut.mat <- mut_matrix(vcf_list = unlist(cluster.5.grl), ref_genome = ref.genome)
# Modify and combine
colnames(cluster.1.mut.mat) <- c("cluster.1")
colnames(cluster.2.mut.mat) <- c("cluster.2")
colnames(cluster.3.mut.mat) <- c("cluster.3")
colnames(cluster.4.mut.mat) <- c("cluster.4")
colnames(cluster.5.mut.mat) <- c("cluster.5")
cluster.summary.mut.mat <- cbind.data.frame(cluster.1.mut.mat, cluster.2.mut.mat, cluster.3.mut.mat, cluster.4.mut.mat, cluster.5.mut.mat)
# Save mutation matrix
saveRDS(cluster.summary.mut.mat, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.summary.mut.mat.rds")
# Assess per genome, per cluster genome-wide mutational burden
# Cluster 1
gw.snvs.count.summary.cluster.1.df <- tibble()
for (i in 1:length(cluster.1.grl)) {
  sample.i <- names(cluster.1.grl@partitioning)[i]
  count.i <- length(unlist(start(cluster.1.grl[i])))
  gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 1)
  gw.snvs.count.summary.cluster.1.df <- rbind(gw.snvs.count.summary.cluster.1.df, gw.summary.i)
}
# Cluster 2
gw.snvs.count.summary.cluster.2.df <- tibble()
for (i in 1:length(cluster.2.grl)) {
  sample.i <- names(cluster.2.grl@partitioning)[i]
  count.i <- length(unlist(start(cluster.2.grl[i])))
  gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 2)
  gw.snvs.count.summary.cluster.2.df <- rbind(gw.snvs.count.summary.cluster.2.df, gw.summary.i)
}
# Cluster 3
gw.snvs.count.summary.cluster.3.df <- tibble()
for (i in 1:length(cluster.3.grl)) {
  sample.i <- names(cluster.3.grl@partitioning)[i]
  count.i <- length(unlist(start(cluster.3.grl[i])))
  gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 3)
  gw.snvs.count.summary.cluster.3.df <- rbind(gw.snvs.count.summary.cluster.3.df, gw.summary.i)
}
# Cluster 4
gw.snvs.count.summary.cluster.4.df <- tibble()
for (i in 1:length(cluster.4.grl)) {
  sample.i <- names(cluster.4.grl@partitioning)[i]
  count.i <- length(unlist(start(cluster.4.grl[i])))
  gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 4)
  gw.snvs.count.summary.cluster.4.df <- rbind(gw.snvs.count.summary.cluster.4.df, gw.summary.i)
}
# Cluster 5
gw.snvs.count.summary.cluster.5.df <- tibble()
for (i in 1:length(cluster.5.grl)) {
  sample.i <- names(cluster.5.grl@partitioning)[i]
  count.i <- length(unlist(start(cluster.5.grl[i])))
  gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 5)
  gw.snvs.count.summary.cluster.5.df <- rbind(gw.snvs.count.summary.cluster.5.df, gw.summary.i)
}
gw.snvs.count.summary.cluster.df <- rbind(gw.snvs.count.summary.cluster.1.df, gw.snvs.count.summary.cluster.2.df,
                                          gw.snvs.count.summary.cluster.3.df, gw.snvs.count.summary.cluster.4.df,
                                          gw.snvs.count.summary.cluster.5.df)
# Save snvs count summary
saveRDS(gw.snvs.count.summary.cluster.df, file = "./002_Mut_signatures_Pan_cancer/rds/gw.snvs.count.summary.cluster.df.rds")

Plot genome-wide signatures and mutational burden.

# r
library(dplyr)
library(MutationalPatterns)
library(ggplot2)
library(scales)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load mutation matrix
cluster.summary.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.summary.mut.mat.rds")

# Plot 96 profiles
plot_96_profile(cluster.summary.mut.mat, condensed = TRUE, ymax = 0.3)

# Save profiles
pdf("./Rplots/snvs.gw.mut.cluster.pdf", width=5, height=6, useDingbats=FALSE)
plot_96_profile(cluster.summary.mut.mat, condensed = TRUE, ymax = 0.3)
dev.off()
## quartz_off_screen 
##                 2
# Compute cosine similarities between origin and genome-wide signatures

# Load origin associated signatures
snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")

# Compute cosine similarites
cos_sim(snvs.ori.mut.cluster[, 1], cluster.summary.mut.mat[, 1]) # 0.8561225
## [1] 0.8561225
cos_sim(snvs.ori.mut.cluster[, 2], cluster.summary.mut.mat[, 2]) # 0.2775516
## [1] 0.2775516
cos_sim(snvs.ori.mut.cluster[, 3], cluster.summary.mut.mat[, 3]) # 0.5507919
## [1] 0.5507919
cos_sim(snvs.ori.mut.cluster[, 4], cluster.summary.mut.mat[, 4]) # 0.4684654
## [1] 0.4684654
cos_sim(snvs.ori.mut.cluster[, 5], cluster.summary.mut.mat[, 5]) # 0.7121845
## [1] 0.7121845

2.5 Comparison of origin-associated signatures with known signatures in cancers

2.5.1 Cosine similarity with COSMIC signatures

# Recover known signatures (COSMIC signatures)
COSMIC.signatures <- as.matrix(read.table("./Dataset/COSMIC/COSMIC_v3.2_SBS_GRCh37.txt", header = T, row.names = 1))
# Reorder rows
new_order <- sapply(rownames(snvs.ori.mut.cluster), function(x,df){which(rownames(COSMIC.signatures) == x)}, df=COSMIC.signatures)
COSMIC.signatures <- COSMIC.signatures[new_order,]
# Compute cosine similarity matrix
snvs.ori.COSMIC.sim.mat <- cos_sim_matrix(COSMIC.signatures, snvs.ori.mut.cluster)
snvs.ori.COSMIC.sim.mat.melt <- melt(snvs.ori.COSMIC.sim.mat)
# Save results
saveRDS(snvs.ori.COSMIC.sim.mat.melt, "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.COSMIC.sim.mat.melt.rds")

Plot

# r
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cosine similarity matrix
snvs.ori.COSMIC.sim.mat.melt <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.COSMIC.sim.mat.melt.rds")
# Plot
snvs.ori.COSMIC.sim.heatmap <- snvs.ori.COSMIC.sim.mat.melt %>% ggplot(aes(x=Var2, y=Var1, fill=value)) + geom_tile(color = "white") +
  scale_fill_gradient2(low = "#4F4A75", high = "#E41F1A", mid = "white", midpoint = 0.4, limit = c(0,1), space = "Lab", name="Cosine\nSimilarity") +
  theme_minimal() + theme(aspect.ratio=10, axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1), axis.text.y = element_text(size = 12), axis.title.x=element_blank(), axis.title.y=element_blank())
snvs.ori.COSMIC.sim.heatmap

pdf("./Rplots/snvs.ori.COSMIC.sim.heatmap.pdf", width=3, height=10, useDingbats=FALSE)
snvs.ori.COSMIC.sim.heatmap
dev.off() 
## quartz_off_screen 
##                 2

The analysis shows that: - cluster 1 has high similarity with SBS7b (cos = 0.835) and SBS7a (cos = 0.793) associated with UV light exposure. This suggests that mutagenesis at origins in skin melanoma is due to defective NER. - SBS43 is the signature with highest similarity to cluster 2 (cos = 0.945). This is a signature of unknown etiology. This may reflect an unknown mutational process operating at origins. - cluster 3 has high similarity with SBS10b (cos = 0.642) associated with mutations within polymerase epsilon exonuclease domain. - cluster 4 has high similarity with SBS84 (cos = 0.877) reflecting the activity of activation-induced cytidine deaminase (AID). This suggests that AID hyperactivity in lymphomas focuses mutagenesis at origins. - cluster 5 has high similarity with SBS5 (cos = 0.846) which is a flat and clock-like signature of unknown etiology. This cluster may then reflect background mutagenesis.

2.5.2 COSMIC signatures refitting

Assess the local exposure of relevant COSMIC signatures. COSMIC signatures with incompatible etiology are excluded (e.g. chemo treatment).

Only signatures contributing for at least 50 snvs in a bin.

# r

# Load COSMIC signatures
COSMIC.signatures <- as.matrix(read.table("./Dataset/COSMIC/COSMIC_v3.2_SBS_GRCh37.txt", header = T, row.names = 1))
# Reorder rows
COSMIC.signatures <- COSMIC.signatures[rownames(snvs.ori.mut.cluster),,drop=FALSE]

# Select COSMIC signatures to exclude
COSMIC.ex <- c("SBS4", "SBS11", "SBS22", "SBS24", "SBS25", "SBS29", "SBS31", "SBS32", "SBS35", "SBS42", "SBS86", "SBS87", "SBS88", "SBS90")
COSMIC.filter <- colnames(COSMIC.signatures)[!colnames(COSMIC.signatures) %in% COSMIC.ex]
COSMIC.sigs <- as.data.frame(COSMIC.signatures) %>% dplyr::select(all_of(COSMIC.filter)) %>% as.matrix()

# Load mutation count matrices
cluster.1.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.mut.mat.rds")
cluster.2.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.mut.mat.rds")
cluster.3.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.mut.mat.rds")
cluster.4.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.mut.mat.rds")
cluster.5.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.mut.mat.rds")

# Cluster 1 - skin melanomas
cluster.1.COSMIC.fit <- fit_to_signatures_strict(cluster.1.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.1.COSMIC.fit.df <- as.data.frame(cluster.1.COSMIC.fit$fit_res$contribution)
cluster.1.COSMIC.fit.df <- as.data.frame(t(cluster.1.COSMIC.fit.df))
cluster.1.COSMIC.fit.filter.df <- cluster.1.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.1.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.1.COSMIC.fit.filter.df.rds")

# Cluster 2 - diverse genomes
cluster.2.COSMIC.fit <- fit_to_signatures_strict(cluster.2.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.2.COSMIC.fit.df <- as.data.frame(cluster.2.COSMIC.fit$fit_res$contribution)
cluster.2.COSMIC.fit.df <- as.data.frame(t(cluster.2.COSMIC.fit.df))
cluster.2.COSMIC.fit.filter.df <- cluster.2.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.2.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.2.COSMIC.fit.filter.df.rds")

# Cluster 3 - diverse genomes
cluster.3.COSMIC.fit <- fit_to_signatures_strict(cluster.3.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.3.COSMIC.fit.df <- as.data.frame(cluster.3.COSMIC.fit$fit_res$contribution)
cluster.3.COSMIC.fit.df <- as.data.frame(t(cluster.3.COSMIC.fit.df))
cluster.3.COSMIC.fit.filter.df <- cluster.3.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.3.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.3.COSMIC.fit.filter.df.rds")

# Cluster 4 - malignant lymphomas
cluster.4.COSMIC.fit <- fit_to_signatures_strict(cluster.4.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.4.COSMIC.fit.df <- as.data.frame(cluster.4.COSMIC.fit$fit_res$contribution)
cluster.4.COSMIC.fit.df <- as.data.frame(t(cluster.4.COSMIC.fit.df))
cluster.4.COSMIC.fit.filter.df <- cluster.4.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.4.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.4.COSMIC.fit.filter.df.rds")

# Cluster 5  - diverse genomes
cluster.5.COSMIC.fit <- fit_to_signatures_strict(cluster.5.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.5.COSMIC.fit.df <- as.data.frame(cluster.5.COSMIC.fit$fit_res$contribution)
cluster.5.COSMIC.fit.df <- as.data.frame(t(cluster.5.COSMIC.fit.df))
cluster.5.COSMIC.fit.filter.df <- cluster.5.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.5.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.5.COSMIC.fit.filter.df.rds")

Plot exposures

# r
library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Cluster 1 - skin melanomas
cluster.1.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.1.COSMIC.fit.filter.df.rds")
cluster.1.COSMIC.fit.filter.plot <- cluster.1.COSMIC.fit.filter.df %>% 
  ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
  scale_color_manual(values = c("#E4221E", "#0775B6")) +
  xlim(-5000,5000) + 
  xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 1") +
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.1.COSMIC.fit.filter.plot

# Cluster 2 - diverse genomes
cluster.2.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.COSMIC.fit.filter.df.rds")
cluster.2.COSMIC.fit.filter.plot <- cluster.2.COSMIC.fit.filter.df %>% 
  ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
  scale_color_manual(values = c("#4F4A75", "#E4221E", "#0775B6", "#D56114")) +
  xlim(-5000,5000) + 
  xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 2") +
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.COSMIC.fit.filter.plot

# Cluster 3 - diverse genomes
cluster.3.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.3.COSMIC.fit.filter.df.rds")
c3.pal <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#0072B2", "#D55E00")
cluster.3.COSMIC.fit.filter.plot <- cluster.3.COSMIC.fit.filter.df %>% 
  ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
  scale_color_manual(values = c3.pal) +
  xlim(-5000,5000) + 
  xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 3") +
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.3.COSMIC.fit.filter.plot

# Cluster 4 - malignant lymphomas
cluster.4.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.4.COSMIC.fit.filter.df.rds")
c4.pal <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#E4221E", "#0072B2", "#D55E00")
cluster.4.COSMIC.fit.filter.plot <- cluster.4.COSMIC.fit.filter.df %>% 
  ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
  scale_color_manual(values = c4.pal) +
  xlim(-5000,5000) + 
  xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 4") +
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.4.COSMIC.fit.filter.plot

# Cluster 5 - diverse genomes
cluster.5.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.5.COSMIC.fit.filter.df.rds")
c5.pal <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#E4221E", "#0072B2", "#D55E00")
# To many signature to plot, select only signatures with signal at origins
cluster.5.sel.COSMIC.sig <- cluster.5.COSMIC.fit.filter.df %>% filter(dist == 0 & value > 0)
cluster.5.COSMIC.fit.filter.plot <- cluster.5.COSMIC.fit.filter.df %>% 
  filter(sig %in% cluster.5.sel.COSMIC.sig$sig) %>% 
  ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
  scale_color_manual(values = c5.pal) +
  xlim(-5000,5000) + 
  xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 5") +
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.5.COSMIC.fit.filter.plot

pdf("./Rplots/COSMIC.refit.pdf", width=14, height=8, useDingbats=FALSE)
ggarrange(cluster.1.COSMIC.fit.filter.plot, cluster.2.COSMIC.fit.filter.plot, cluster.3.COSMIC.fit.filter.plot, cluster.4.COSMIC.fit.filter.plot, cluster.5.COSMIC.fit.filter.plot, ncol = 3, nrow = 2)
dev.off()  
## quartz_off_screen 
##                 2

3 NER efficiency at origins in skin melanomas

Cluster 1 of mutational signatures suggest differential NER activity at replication origins compared to background.

In this section, we aim to characterise NER efficiency at origins and correlation with local variation in mutation burden.

NER efficiency is assessed using XR-seq data from Hu et al. GENES & DEVELOPMENT 2015 29:948–960 available at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE67941 data is hg19

3.1 NER activity at origins

We first aim to demonstrate that origin sites are prone to damage induced by UV exposure as there are site of increased accessibility.

To assess accessibility we use DHS data from ENCODE for GM04504 cells (Normal Human Dermal Fibroblasts (NHDF)) recovered from https://www.encodeproject.org/experiments/ENCSR000EMP/ (replicate 1 - ENCAN140WTK).

NER activity is assessed from XR-seq signal at consitutive origins mapping two UV-induced DNA damages: cyclo-butane pyrimidine dimers (CPDs) and (6-4) pyrimidine–pyrimidone photoproducts [(6-4)PPs]. We use data from a CS-B (ERCC6) mutant NHF1 cell line to assess global NER rather than transcription-coupled NER. As XR-seq is a stranded technique we first considered aggregated data form plus and minus strands.

# r

# Load XR-seq data for CSB cells

# CPD
XRseq.CPD.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.CPD.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")
seqlevelsStyle(XRseq.CPD.plus.bw) <- "NCBI"
seqlevelsStyle(XRseq.CPD.minus.bw) <- "NCBI"

# 64PP
XRseq.64PP.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSB64_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.64PP.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSB64_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")
seqlevelsStyle(XRseq.64PP.plus.bw) <- "NCBI"
seqlevelsStyle(XRseq.64PP.minus.bw) <- "NCBI"

# Combine both plus and minus sequencing information

# CSB cells - CPD
XRseq.CPD.bw <- XRseq.CPD.plus.bw
XRseq.CPD.bw$plus <- XRseq.CPD.bw$score
XRseq.CPD.bw$minus <- XRseq.CPD.minus.bw$score
XRseq.CPD.bw$score <- XRseq.CPD.bw$plus + XRseq.CPD.bw$minus
export.bw(XRseq.CPD.bw, "./Dataset/Hu_2015/BW/XRseq.CSB.CPD.unstranded.bw")

# CSB cells - 64PP
XRseq.64PP.bw <- XRseq.64PP.plus.bw
XRseq.64PP.bw$plus <- XRseq.64PP.bw$score
XRseq.64PP.bw$minus <- XRseq.64PP.minus.bw$score
XRseq.64PP.bw$score <- XRseq.64PP.bw$plus + XRseq.64PP.bw$minus
export.bw(XRseq.64PP.bw, "./Dataset/Hu_2015/BW/XRseq.CSB.64PP.unstranded.bw")

# Association between NER activity and accessibility at origins

XRseq.CPD.bw <- import.bw("./Dataset/Hu_2015/BW/XRseq.CSB.CPD.unstranded.bw")
XRseq.64PP.bw <- import.bw("./Dataset/Hu_2015/BW/XRseq.CSB.64PP.unstranded.bw")

# Load DHS data

DHS.GM04504.bw <- import.bw("./Dataset/DHS/ENCFF228HOM_DHS_GM04504.bigWig")
seqlevelsStyle(DHS.GM04504.bw) <- "NCBI"

# Compute matrices

# For XRseq (CSB NHF1 cells - CPD)
XRseq.CSB.CPD.ori.mat <- normalizeToMatrix(XRseq.CPD.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 5000, w = 100, background = 0)
saveRDS(XRseq.CSB.CPD.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.ori.mat.rds")

# For XRseq (CSB NHF1 cells - 64PP)
XRseq.CSB.64PP.ori.mat <- normalizeToMatrix(XRseq.64PP.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 5000, w = 100, background = 0)
saveRDS(XRseq.CSB.64PP.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.64PP.ori.mat.rds")

# For DHS
DHS.ori.mat <- normalizeToMatrix(DHS.GM04504.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 5000, w = 100, background = 0)
saveRDS(DHS.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/DHS.ori.mat.rds")

col.fun.CPD  <- colorRamp2(quantile(XRseq.CSB.CPD.ori.mat, c(0, 0.95)), c("white", "#0775B6"))
col.fun.64PP  <- colorRamp2(quantile(XRseq.CSB.64PP.ori.mat, c(0, 0.95)), c("white", "#0775B6"))
col.fun.DHS  <- colorRamp2(quantile(DHS.ori.mat, c(0, 0.95)), c("white", "#E4221E"))

pdf("./Rplots/XRseq.DHS.heatmaps.pdf", width=6, height=4, useDingbats=FALSE)
EnrichedHeatmap(XRseq.CSB.CPD.ori.mat, col = col.fun.CPD, name = "XRseq - CPD", pos_line = F) +
  EnrichedHeatmap(XRseq.CSB.64PP.ori.mat, col = col.fun.64PP, name = "XRseq - 64PP", pos_line = F) +
  EnrichedHeatmap(DHS.ori.mat, col = col.fun.DHS, name = "DHS", pos_line = F)
dev.off()

# The resulting pdf is subsequently modified and saved as a png
NER efficiency and DHS accessibility at origins

NER efficiency and DHS accessibility at origins

Assess correlation between DHS signal and NER activity

# r

# Assess correlation between DHS signal and NER activity

# Compute mean signals over origins (1 kb centered at origins - 10 bins)

XRseq.CSB.CPD.ori.val <- rowMeans(XRseq.CSB.CPD.ori.mat[,45:54], na.rm = T)
XRseq.CSB.64PP.ori.val <- rowMeans(XRseq.CSB.64PP.ori.mat[,45:54], na.rm = T)
DHS.ori.val <- rowMeans(DHS.ori.mat[,45:54], na.rm = T)
# Combine information, format and plot
XRseq.DHS.CPD.df <- cbind.data.frame(ori.id = ori.gr$ori.id, XRseq.CPD = XRseq.CSB.CPD.ori.val, XRseq.64PP = XRseq.CSB.64PP.ori.val, DHS = DHS.ori.val) %>% 
  mutate(DHS.bin = ntile(DHS, 20)) %>% group_by(DHS.bin) %>% 
  summarise(DHS = mean(DHS, na.rm = T), XRseq.mean = mean(XRseq.CPD, na.rm = T), XRseq.sem = std.error(XRseq.CPD, na.rm = T)) %>% mutate(exp = "CPD")
XRseq.DHS.64PP.df <- cbind.data.frame(ori.id = ori.gr$ori.id, XRseq.CPD = XRseq.CSB.CPD.ori.val, XRseq.64PP = XRseq.CSB.64PP.ori.val, DHS = DHS.ori.val) %>% 
  mutate(DHS.bin = ntile(DHS, 20)) %>% group_by(DHS.bin) %>% 
  summarise(DHS = mean(DHS, na.rm = T), XRseq.mean = mean(XRseq.64PP, na.rm = T), XRseq.sem = std.error(XRseq.64PP, na.rm = T)) %>% mutate(exp = "64PP")
XRseq.DHS.summary.df <- rbind(XRseq.DHS.CPD.df, XRseq.DHS.64PP.df)

# Assess corelations
cor.test(DHS.ori.val, XRseq.CSB.CPD.ori.val) # Rho = 0.4012731, p-value < 2.2e-16
cor.test(DHS.ori.val, XRseq.CSB.64PP.ori.val) # Rho = 0.1686478, p-value < 2.2e-16

# Save
saveRDS(XRseq.DHS.summary.df, "./002_Mut_signatures_Pan_cancer/rds/XRseq.DHS.summary.df.rds")
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

XRseq.DHS.summary.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/XRseq.DHS.summary.df.rds")
XRseq.DHS.summary.plot <- XRseq.DHS.summary.df %>%
  ggplot(aes(x=DHS, y=XRseq.mean, color = exp)) +
  geom_line() +
  geom_errorbar(aes(ymin=XRseq.mean-XRseq.sem, ymax=XRseq.mean+XRseq.sem), width=.2, position=position_dodge(0.05)) +
  geom_point(shape = 21, size = 2, fill = "white") +
  scale_color_manual(values = c("#E41F1A", "#0474B5")) +
  scale_x_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
  ylab("NER activity (normalised read count)") + xlab("DHS coverage bin (log10)") + ggtitle("Rho = 0.401, 0.168, p-val < 2.2e-16") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.DHS.summary.plot

pdf("./Rplots/XRseq.DHS.summary.plot.pdf", width=7, height=6, useDingbats=FALSE)
XRseq.DHS.summary.plot
dev.off()
## quartz_off_screen 
##                 2

3.2 NER efficiency at origins

Report NER efficiency at origins - we now consider strand specific data.

# r
library(dplyr)
library(tidyr)
library(rtracklayer)
library(EnrichedHeatmap)
library(ggplot2)
library(plotrix)
library(scales)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load origin center coordinates
ori.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed", sep="\t",)
colnames(ori.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.gr <- makeGRangesFromDataFrame(ori.df, keep.extra.columns = T)

# Load XR-seq data
# We use data from a CS-B (ERCC6) mutant NHF1 cell line to assess global NER rather than transcription-coupled NER
# from Hu et al. GENES & DEVELOPMENT 2015 29:948–960
# downloaded at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE67941

XRseq.CPD.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.CPD.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")
seqlevelsStyle(XRseq.CPD.plus.bw) <- "NCBI"
seqlevelsStyle(XRseq.CPD.minus.bw) <- "NCBI"

XRseq.64PP.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSB64_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.64PP.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSB64_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")
seqlevelsStyle(XRseq.64PP.plus.bw) <- "NCBI"
seqlevelsStyle(XRseq.64PP.minus.bw) <- "NCBI"

# Compute NER activity at origins

# For CPD
XRseq.CPD.plus.ori.mat <- normalizeToMatrix(XRseq.CPD.plus.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 10000, w = 100, background = 0)
saveRDS(XRseq.CPD.plus.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.ori.mat.rds")

XRseq.CPD.minus.ori.mat <- normalizeToMatrix(XRseq.CPD.minus.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 10000, w = 100, background = 0)
saveRDS(XRseq.CPD.minus.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.ori.mat.rds")

# For 6-4PP
XRseq.64PP.plus.ori.mat <- normalizeToMatrix(XRseq.64PP.plus.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 10000, w = 100, background = 0)
saveRDS(XRseq.64PP.plus.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.64PP.plus.ori.mat.rds")

XRseq.64PP.minus.ori.mat <- normalizeToMatrix(XRseq.64PP.minus.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 10000, w = 100, background = 0)
saveRDS(XRseq.64PP.minus.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.64PP.minus.ori.mat.rds")

Plot

# r
library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load data
XRseq.CPD.plus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.ori.mat.rds")
XRseq.CPD.minus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.ori.mat.rds")
XRseq.64PP.plus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.64PP.plus.ori.mat.rds")
XRseq.64PP.minus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.64PP.minus.ori.mat.rds")

# Format

# For CPD
XRseq.CPD.plus.ori.df <- as.data.frame(XRseq.CPD.plus.ori.mat) %>% summarise_all(~ mean(.x, na.rm = TRUE)) %>% t() %>% `colnames<-`(.[1, ]) %>% .[-1, ] %>% as_tibble() %>% mutate(dist = seq(-9900,10000,100)) %>% mutate(strand = "plus")
XRseq.CPD.minus.ori.df <- as.data.frame(XRseq.CPD.minus.ori.mat) %>% summarise_all(~ mean(.x, na.rm = TRUE)) %>% t() %>% `colnames<-`(.[1, ]) %>% .[-1, ] %>% as_tibble() %>% mutate(dist = seq(-9900,10000,100)) %>% mutate(strand = "minus")
XRseq.CPD.ori.df <- rbind(XRseq.CPD.plus.ori.df, XRseq.CPD.minus.ori.df)
# Compute strand bias
XRseq.CPD.sb.df <- (XRseq.CPD.plus.ori.df %>% dplyr::select(dist, plus = value)) %>% left_join((XRseq.CPD.minus.ori.df %>% dplyr::select(dist, minus = value)), by = "dist") %>% mutate(strand.bias = plus - minus)

# For (6-4)PP
XRseq.64PP.plus.ori.df <- as.data.frame(XRseq.64PP.plus.ori.mat) %>% summarise_all(~ mean(.x, na.rm = TRUE)) %>% t() %>% `colnames<-`(.[1, ]) %>% .[-1, ] %>% as_tibble() %>% mutate(dist = seq(-9900,10000,100)) %>% mutate(strand = "plus")
XRseq.64PP.minus.ori.df <- as.data.frame(XRseq.64PP.minus.ori.mat) %>% summarise_all(~ mean(.x, na.rm = TRUE)) %>% t() %>% `colnames<-`(.[1, ]) %>% .[-1, ] %>% as_tibble() %>% mutate(dist = seq(-9900,10000,100)) %>% mutate(strand = "minus")
XRseq.64PP.ori.df <- rbind(XRseq.64PP.plus.ori.df, XRseq.64PP.minus.ori.df)

# Plot

XRseq.CPD.ori.plot <- XRseq.CPD.ori.df %>% ggplot(aes(x = dist, y = value, class = strand)) +
  geom_line(aes(linetype = strand), color = "#0474B5") + xlim(-5000, 5000) +
  scale_linetype_manual(values=c("twodash", "solid")) +
  xlab("Distance from origin (bp)") + ylab("NER efficiency (normalised read count)") + ggtitle("CPD") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.CPD.ori.plot

XRseq.64PP.ori.plot <- XRseq.64PP.ori.df %>% ggplot(aes(x = dist, y = value, class = strand)) +
  geom_line(aes(linetype = strand), color = "#E41F1A") + xlim(-5000, 5000) +
  scale_linetype_manual(values=c("twodash", "solid")) +
  xlab("Distance from origin (bp)") + ylab("NER efficiency (normalised read count)") + ggtitle("(6-4)PP") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.64PP.ori.plot

pdf("./Rplots/XRseq.ori.plot.pdf", width=10, height=6, useDingbats=FALSE)
ggarrange(XRseq.CPD.ori.plot, XRseq.64PP.ori.plot, nrow = 1, ncol = 2)
dev.off()
## quartz_off_screen 
##                 2

Plot the distribution of snvs at origins for cluster 1 genome for comparison

# r

# Recover origin-associated snvs in cluster 1
cluster.1.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.10kb.bed", sep = "\t")
colnames(cluster.1.snvs.10kb.df) <- c("seqnames", "start", "end", "REF", "ALT", "seqnames.ori", "start.ori", "end.ori", "EFF", "ori.id", "ori.width", "dist")

# Compute snvs density in mutations per Mb (9,341 origins - 42 genomes)
cluster.1.snvs.dist.df <- cluster.1.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-dist, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na() %>% group_by(dist) %>% summarise(snvs.count = n()) %>% mutate(snvs.dens = ((snvs.count/(9341*42))*10e6)/100)
saveRDS(cluster.1.snvs.dist.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.1.snvs.dist.df.rds")
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

cluster.1.snvs.dist.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/cluster.1.snvs.dist.df.rds")
cluster.1.snvs.dist.plot <- cluster.1.snvs.dist.df %>% ggplot(aes(x = dist, y = snvs.dens)) +
  geom_line(color = "#E41F1A") + xlim(-5000, 5000) +
  xlab("Distance from origin (bp)") + ylab("Mutations per Mb") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.1.snvs.dist.plot

pdf("./Rplots/cluster.1.snvs.dist.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.1.snvs.dist.plot
dev.off()
## quartz_off_screen 
##                 2

Assess the correlation between XR-seq signal and mutations at origins in cluster 1 genomes

# Recover origin-associated snvs in cluster 1
cluster.1.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.10kb.bed", sep = "\t")
colnames(cluster.1.snvs.10kb.df) <- c("seqnames", "start", "end", "REF", "ALT", "seqnames.ori", "start.ori", "end.ori", "EFF", "ori.id", "ori.width", "dist")
cluster.1.snvs.ori.summary.df <- cluster.1.snvs.10kb.df %>% filter(dist >= -2000 & dist <= 2000) %>% group_by(ori.id) %>% summarise(snvs.count = n())

# Compute averaged XRseq values

# For CPD
XRseq.CPD.plus.ori.values <- rowMeans(as.data.frame(XRseq.CPD.plus.ori.mat)[,95:105], na.rm = T)
XRseq.CPD.minus.ori.values <- rowMeans(as.data.frame(XRseq.CPD.minus.ori.mat)[,95:105], na.rm = T)
XRseq.CPD.ori.df <- cbind.data.frame(ori.id = ori.gr$ori.id, CPD.plus = XRseq.CPD.plus.ori.values, CPD.minus = XRseq.CPD.minus.ori.values) %>% 
  mutate(CPD = CPD.plus + CPD.minus) %>% left_join(cluster.1.snvs.ori.summary.df, by = "ori.id") %>% 
  mutate(bin = ntile(CPD, 20)) %>% group_by(bin) %>% summarise(read = mean(CPD, na.rm = T), snvs.mean = mean(snvs.count, na.rm = T), snvs.sem = std.error(snvs.count, na.rm = T)) %>% mutate(exp = "CPD")

# For (6-4)PP
XRseq.64PP.plus.ori.values <- rowMeans(as.data.frame(XRseq.64PP.plus.ori.mat)[,95:105], na.rm = T)
XRseq.64PP.minus.ori.values <- rowMeans(as.data.frame(XRseq.64PP.minus.ori.mat)[,95:105], na.rm = T)
XRseq.64PP.ori.df <- cbind.data.frame(ori.id = ori.gr$ori.id, PP.plus = XRseq.64PP.plus.ori.values, PP.minus = XRseq.64PP.minus.ori.values) %>% 
  mutate(PP = PP.plus + PP.minus + 1) %>% left_join(cluster.1.snvs.ori.summary.df, by = "ori.id") %>% 
  mutate(bin = ntile(PP, 20)) %>% group_by(bin) %>% summarise(read = mean(PP, na.rm = T), snvs.mean = mean(snvs.count, na.rm = T), snvs.sem = std.error(snvs.count, na.rm = T)) %>% mutate(exp = "64PP")

# Save data for plots
XRseq.CPD.64PP.ori.summary.df <- rbind(XRseq.CPD.ori.df, XRseq.64PP.ori.df)
saveRDS(XRseq.CPD.64PP.ori.summary.df, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.64PP.ori.summary.df.rds")

# Compute stats

XRseq.CPD.ori.values.df <- cbind.data.frame(ori.id = ori.gr$ori.id, CPD.plus = XRseq.CPD.plus.ori.values, CPD.minus = XRseq.CPD.minus.ori.values) %>% 
  mutate(CPD = CPD.plus + CPD.minus) %>% left_join(cluster.1.snvs.ori.summary.df, by = "ori.id") %>% mutate(log10.CPD = log10(CPD)) %>% filter(log10.CPD != "Inf" & log10.CPD != "-Inf")
cor.test(XRseq.CPD.ori.values.df$log10.CPD, XRseq.CPD.ori.values.df$snvs.count)  # Rho = -0.4411795, p-value < 2.2e-16

XRseq.64PP.ori.values.df <- cbind.data.frame(ori.id = ori.gr$ori.id, PP.plus = XRseq.64PP.plus.ori.values, PP.minus = XRseq.64PP.minus.ori.values) %>% 
  mutate(PP = PP.plus + PP.minus) %>% left_join(cluster.1.snvs.ori.summary.df, by = "ori.id") %>% mutate(log10.64PP = log10(PP)) %>% filter(log10.64PP != "Inf" & log10.64PP != "-Inf")
cor.test(XRseq.64PP.ori.values.df$log10.64PP, XRseq.64PP.ori.values.df$snvs.count)  # Rho = -0.3659423, p-value < 2.2e-16
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

XRseq.CPD.64PP.ori.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.64PP.ori.summary.df.rds")
XRseq.CPD.64PP.ori.summary.plot <- XRseq.CPD.64PP.ori.summary.df %>%
  ggplot(aes(x=read, y=snvs.mean, color = exp)) +
  geom_line() +
  geom_errorbar(aes(ymin=snvs.mean-snvs.sem, ymax=snvs.mean+snvs.sem), width=.01, position=position_dodge(0.05)) +
  geom_point(shape = 21, size = 2, fill = "white") +
  scale_color_manual(values = c("#E41F1A", "#0474B5")) +
  scale_x_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
  ylab("Mean mutations per origin") + xlab("CPD XR-seq, binned by read coverage") + ggtitle("Rho-CPD = -0.945, Rho-PP = -0.909, p-value < 2.2e-16") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.CPD.64PP.ori.summary.plot

pdf("./Rplots/XRseq.ori.snvs.plot.pdf", width=7, height=6, useDingbats=FALSE)
XRseq.CPD.64PP.ori.summary.plot
dev.off()
## quartz_off_screen 
##                 2

The previous experiments suggest a strand bias for NER activity at origins. We assess herein whether this due to dinucleotide biases or due to the single strandeness nature of the lagging strand.

XR-seq data have been generated using the MC-062 and 64M-2 antibodies for the CPD and 64PP data respectively. The MC-062 has been raised against thymidine dimers, then CPD XR-seq data are corrected for TT frequency. The 64M-2 antobodies have been raised against UV-C irradiated DNA, in this condition 64-PP adduct are formed at CC, CT and TC dinucleodites. The 64PP XR-seq is then corrected for the frequencies of these dinucleotides.

Relevant websites: - for CPD: https://www.kamiyabiomedical.com/cgi-bin/search.cgi?search=MC-062&language=english - for 64PP: https://www.cosmobiousa.com/products/anti-6-4pps-mab-clone-64m-2

Extract dinucleotide frequencies.

# r
# Extract dinucleotide frequency
# Load domain/bin coordinates
ori.10kb.domain.100nt.split.gr <- import("./Dataset/ori/ori.10kb.domain.100nt.split.hg19.NCBI.bed")
my.chr <- c(1:22, "X", "Y")
ori.10kb.domain.100nt.split.gr <- ori.10kb.domain.100nt.split.gr[seqnames(ori.10kb.domain.100nt.split.gr) %in% my.chr]
seqlevelsStyle(ori.10kb.domain.100nt.split.gr) <- "UCSC"
# Recover sequences and compute base composition statistics
ori.views <- Views(Hsapiens, ori.10kb.domain.100nt.split.gr)
# dinuc <- (expand.grid(c("A", "C", "G", "T"), c("A", "C", "G", "T")) %>% mutate(dinuc = paste0(Var1, Var2)))$dinuc
ori.dinuc <- dinucleotideFrequency(ori.views, step=1, as.prob = FALSE)
ori.dinuc.df <- cbind.data.frame(bin = ori.10kb.domain.100nt.split.gr$name, ori.dinuc)
ori.dinuc.summary <- ori.dinuc.df %>% group_by(bin) %>% summarise_at(colnames(ori.dinuc.df)[-1], sum) %>%
  mutate(dinuc.sum = rowSums(across(where(is.numeric)))) %>% 
  mutate(dist = (as.numeric(bin)-100)*100,
         TT.freq = TT/dinuc.sum,
         AA.freq = AA/dinuc.sum,
         CC.freq = CC/dinuc.sum,
         GG.freq = GG/dinuc.sum,
         CT.freq = CT/dinuc.sum,
         AG.freq = AG/dinuc.sum,
         TC.freq = TC/dinuc.sum,
         GA.freq = GA/dinuc.sum)
write.csv(ori.dinuc.summary, "./Dataset/ori/ori.dinuc.summary.csv", quote = F, row.names = F)

Correct XR-seq signal

#r

# Correction
# for CPD
XRseq.CPD.ori.corr.df <- XRseq.CPD.ori.df %>% left_join(ori.dinuc.summary, by = "dist") %>% 
  mutate(value.corr = case_when(strand == "plus" ~ value/TT.freq, T ~ value/AA.freq))
# for 64PP
XRseq.64PP.ori.corr.df <- XRseq.64PP.ori.df %>% left_join(ori.dinuc.summary, by = "dist") %>% 
  mutate(value.corr = case_when(strand == "plus" ~ value/(CC.freq+CT.freq+TC.freq), T ~ value/(GG.freq+AG.freq+GA.freq)))

# for CPD
XRseq.CPD.plus.df <- XRseq.CPD.ori.corr.df %>% filter(strand == "plus") %>% dplyr::select(dist, plus = value.corr)
XRseq.CPD.minus.df <- XRseq.CPD.ori.corr.df %>% filter(strand == "minus") %>% dplyr::select(dist, minus = value.corr)
XRseq.CPD.strand.df <- XRseq.CPD.plus.df %>% left_join(XRseq.CPD.minus.df, by = "dist") %>% mutate(bias = plus / minus, class ="CPD")
# for 64PP
XRseq.64PP.plus.df <- XRseq.64PP.ori.corr.df %>% filter(strand == "plus") %>% dplyr::select(dist, plus = value.corr)
XRseq.64PP.minus.df <- XRseq.64PP.ori.corr.df %>% filter(strand == "minus") %>% dplyr::select(dist, minus = value.corr)
XRseq.64PP.strand.df <- XRseq.64PP.plus.df %>% left_join(XRseq.64PP.minus.df, by = "dist") %>% mutate(bias = plus / minus, class ="64PP")
# Combine
XRseq.strand.summary.df <- rbind(XRseq.CPD.strand.df, XRseq.64PP.strand.df)
saveRDS(XRseq.strand.summary.df, "./002_Mut_signatures_Pan_cancer/rds/XRseq.strand.summary.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load data
XRseq.strand.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.strand.summary.df.rds")

# Plot
XRseq.strand.summary.plot <- XRseq.strand.summary.df %>% ggplot(aes(x = dist, y = bias, color = class)) +
  geom_line() + xlim(-5000, 5000) +
  scale_color_manual(values = c("#E41F1A", "#0474B5")) +
  xlab("Distance from origin (bp)") + ylab("NER strand bias (plus / minus)") +
  geom_hline(yintercept=1, linetype="dashed") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.strand.summary.plot

pdf("./Rplots/XRseq.strand.bias.plot.pdf", width=7, height=6, useDingbats=FALSE)
XRseq.strand.summary.plot
dev.off()
## quartz_off_screen 
##                 2

3.3 Cluster 1 SBS strand asymetry

XR-seq profiles at origins suggest that a strand asymetry for exposure to cluster 1 SBS

We assess this point by fitting the corresponding signature to pyrimidine and purine mutations separately

# Load cluster 1 snvs gr list
cluster.1.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.1.snvs.grl.rds")

# Split pyrimidine and purine mutations
cluster.1.pyr.grl <- cluster.1.snvs.grl
cluster.1.pur.grl <- cluster.1.snvs.grl
for (i in 1:length(cluster.1.snvs.grl)) {
  print(i/length(cluster.1.snvs.grl))
  gr.i <- cluster.1.snvs.grl[i]
  bin.i <- names(gr.i)
  unlist.gr.i <- unlist(gr.i)
  pyr.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "C" | as.character(unlist.gr.i$REF) == "T"]
  pyr.gr.comp.i <- GRangesList(pyr.gr.i, compress=TRUE)
    cluster.1.pyr.grl[i] <- pyr.gr.comp.i
  pur.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "G" | as.character(unlist.gr.i$REF) == "A"]
  pur.gr.comp.i <- GRangesList(pur.gr.i, compress=TRUE)
    cluster.1.pur.grl[i] <- pur.gr.comp.i
}
saveRDS(cluster.1.pyr.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.pyr.grl.rds")
saveRDS(cluster.1.pur.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.pur.grl.rds")

# Compute mutational matrices
cluster.1.pyr.mat <- mut_matrix(vcf_list = cluster.1.pyr.grl, ref_genome = ref.genome)
cluster.1.pur.mat <- mut_matrix(vcf_list = cluster.1.pur.grl, ref_genome = ref.genome)

# Fit signatures
cluster.1.pyr.fit <- fit_to_signatures(cluster.1.pyr.mat, snvs.ori.mut.cluster)
cluster.1.pur.fit <- fit_to_signatures(cluster.1.pur.mat, snvs.ori.mut.cluster)

# Format
cluster.1.pyr.fit.df <- as.data.frame(cluster.1.pyr.fit$contribution)
  cluster.1.pyr.fit.df <- as.data.frame(t(cluster.1.pyr.fit.df)) %>% add_rownames(var = "dist") %>% select(dist, value = cluster.1) %>% mutate(strand = "plus")
cluster.1.pur.fit.df <- as.data.frame(cluster.1.pur.fit$contribution)
  cluster.1.pur.fit.df <- as.data.frame(t(cluster.1.pur.fit.df)) %>% add_rownames(var = "dist") %>% select(dist, value = cluster.1) %>% mutate(strand = "minus")
cluster.1.strand.fit.df <- rbind(cluster.1.pyr.fit.df, cluster.1.pur.fit.df) 
cluster.1.strand.fit.df$dist <- as.numeric(cluster.1.strand.fit.df$dist)
saveRDS(cluster.1.strand.fit.df, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.1.strand.fit.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

cluster.1.strand.fit.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/cluster.1.strand.fit.df.rds")
cluster.1.strand.fit.plot <- cluster.1.strand.fit.df %>% ggplot(aes(x = dist, y = value, color = strand)) +
  geom_line() + xlim(-5000, 5000) +
  scale_color_manual(values = c("#0474B5", "#E41F1A")) +
  xlab("Distance from origin (bp)") + ylab("Absolute SBS\ncluster 1 contribution") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.1.strand.fit.plot

pdf("./Rplots/cluster.1.strand.fit.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.1.strand.fit.plot
dev.off()
## quartz_off_screen 
##                 2

3.4 NER efficiency at TFBS

ENCODE clustered transcription factor binding sites from UCSC http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encRegTfbsClustered/

# R

TF.data.df <- read.table("./Dataset/UCSC/encRegTfbsClusteredWithCells.hg19.bed.gz")
colnames(TF.data.df) <- c("seqnames", "start", "end", "TF", "col", "cell.line")

# Use data from GM23338, skin fibroblast

all.TF.GM23338.data.df <- TF.data.df %>%
  filter(cell.line %in% cell.line[grepl("GM23338", cell.line)]) # 123255 TFBS
all.TF.GM23338.data.gr <- makeGRangesFromDataFrame(all.TF.GM23338.data.df, keep.extra.columns = T)
unique(all.TF.GM23338.data.df$TF) # 5 TFs: CTCF, REST, EZH2, ETS1, NANOG

# Filter specific TFs

# We focus on ETS1 and CTCF

ETS1.GM23338.data.df <- TF.data.df %>%
  filter(TF == "ETS1") %>% 
  filter(cell.line %in% cell.line[grepl("GM23338", cell.line)]) # 8856 TFBS
ETS1.GM23338.data.gr <- makeGRangesFromDataFrame(ETS1.GM23338.data.df, keep.extra.columns = T)

CTCF.GM23338.data.df <- TF.data.df %>%
  filter(TF == "CTCF") %>% 
  filter(cell.line %in% cell.line[grepl("GM23338", cell.line)]) # 90599 TFBS
CTCF.GM23338.data.gr <- makeGRangesFromDataFrame(CTCF.GM23338.data.df, keep.extra.columns = T)

################################################################################
# Assess overlap with constitutive origins

ori.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed")
colnames(ori.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.df <- ori.df %>% mutate(start = round(start-(ori.width/2)), end = round(start+(ori.width/2)))
ori.gr <- makeGRangesFromDataFrame(ori.df, keep.extra.columns = T)
seqlevelsStyle(ori.gr) <- "UCSC"
# 9341 origins

ori.TF.overlap <- findOverlaps(ori.gr, all.TF.GM23338.data.gr, minoverlap = 100L)
  ori.no.TF.id <- unique(ori.gr[-queryHits(ori.TF.overlap)])$ori.id # 4377
  ori.TF.id <- unique(ori.gr[queryHits(ori.TF.overlap)])$ori.id # 4964
  
ori.ETS1.overlap <- findOverlaps(ori.gr, ETS1.GM23338.data.gr, minoverlap = 100L)
  ori.no.ETS1.id <- unique(ori.gr[-queryHits(ori.ETS1.overlap)])$ori.id # 7596
  ori.ETS1.id <- unique(ori.gr[queryHits(ori.ETS1.overlap)])$ori.id # 1745

ori.CTCF.overlap <- findOverlaps(ori.gr, CTCF.GM23338.data.gr, minoverlap = 100L)
  ori.no.CTCF.id <- unique(ori.gr[-queryHits(ori.CTCF.overlap)])$ori.id # 6268
  ori.CTCF.id <- unique(ori.gr[queryHits(ori.CTCF.overlap)])$ori.id # 3073

############################################################################
# Assess NER activity at TFBS

# Compute TFBS midpoints

all.TF.GM23338.midpoints.df <- all.TF.GM23338.data.df %>%
  mutate(start = round((start+end)/2), end = start + 1)
all.TF.GM23338.midpoints.gr <- makeGRangesFromDataFrame(all.TF.GM23338.midpoints.df) # 123255 ranges

ETS1.GM23338.midpoints.df <- ETS1.GM23338.data.df %>%
  mutate(start = round((start+end)/2), end = start + 1)
ETS1.GM23338.midpoints.gr <- makeGRangesFromDataFrame(ETS1.GM23338.midpoints.df) # 8856 ranges

CTCF.GM23338.midpoints.df <- CTCF.GM23338.data.df %>%
  mutate(start = round((start+end)/2), end = start + 1)
CTCF.GM23338.midpoints.gr <- makeGRangesFromDataFrame(CTCF.GM23338.midpoints.df) # 90599 ranges

# Load XR-seq data

XRseq.CPD.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.CPD.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")

# Compute NER activity at TFBS clusters

# For CPD

XRseq.CPD.plus.all.TF.mat <- normalizeToMatrix(XRseq.CPD.plus.bw, all.TF.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.plus.all.TF.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.all.TF.mat.rds")

XRseq.CPD.minus.all.TF.mat <- normalizeToMatrix(XRseq.CPD.minus.bw, all.TF.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.minus.all.TF.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.all.TF.mat.rds")

XRseq.CPD.plus.ETS1.mat <- normalizeToMatrix(XRseq.CPD.plus.bw, ETS1.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.plus.ETS1.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.ETS1.mat.rds")

XRseq.CPD.minus.ETS1.mat <- normalizeToMatrix(XRseq.CPD.minus.bw, ETS1.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.minus.ETS1.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.ETS1.mat.rds")

XRseq.CPD.plus.CTCF.mat <- normalizeToMatrix(XRseq.CPD.plus.bw, CTCF.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.plus.CTCF.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.CTCF.mat.rds")

XRseq.CPD.minus.CTCF.mat <- normalizeToMatrix(XRseq.CPD.minus.bw, CTCF.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.minus.CTCF.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.CTCF.mat.rds")

# Combine information from plus and minus strand, prepare df for plotting

XRseq.CPD.all.TF.values <- colMeans(as.data.frame(XRseq.CPD.plus.all.TF.mat))+colMeans(as.data.frame(XRseq.CPD.minus.all.TF.mat))
XRseq.CPD.ETS1.values <- colMeans(as.data.frame(XRseq.CPD.plus.ETS1.mat))+colMeans(as.data.frame(XRseq.CPD.minus.ETS1.mat))
XRseq.CPD.CTCF.values <- colMeans(as.data.frame(XRseq.CPD.plus.CTCF.mat))+colMeans(as.data.frame(XRseq.CPD.minus.CTCF.mat))
XRseq.CPD.TF.df <- cbind.data.frame(dist = seq(-1000,1000,10), all.TF = XRseq.CPD.all.TF.values, ETS1 = XRseq.CPD.ETS1.values, CTCF = XRseq.CPD.CTCF.values) %>% 
  gather("overlap", "value", -dist)
saveRDS(XRseq.CPD.TF.df, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.TF.df.rds")

############################################################################
# Assess mutation rates at TFBS

# Load cluster 1 snvs coordinates

cluster.1.grl <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.1.grl.rds")
cluster.1.gr <- unlist(cluster.1.grl)

# Compute distances

snvs.all.TF.nearest <- nearest(cluster.1.gr, all.TF.GM23338.midpoints.gr)
snvs.all.TF.nearest[is.na(snvs.all.TF.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values  
snvs.pos <- as.data.frame(ranges(cluster.1.gr))$start
all.TF.pos <- all.TF.GM23338.midpoints.gr[snvs.all.TF.nearest]
snvs.all.TF.nearest.df <- cbind.data.frame(snvs.pos, all.TF.pos) %>% mutate(dist = snvs.pos - start) %>% 
  filter(dist >= -5000 & dist <= 5000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-5000, 5000, 10)))-500)*10) %>% 
  group_by(dist) %>% summarise(all.TF = n())
plot(snvs.all.TF.nearest.df$dist, snvs.all.TF.nearest.df$all.TF, xlim = c(-500, 500), ylim = c(2100, 2900))

snvs.ETS1.nearest <- nearest(cluster.1.gr, ETS1.GM23338.midpoints.gr)
snvs.ETS1.nearest[is.na(snvs.ETS1.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values  
  snvs.pos <- as.data.frame(ranges(cluster.1.gr))$start
  ETS1.pos <- ETS1.GM23338.midpoints.gr[snvs.ETS1.nearest]
snvs.ETS1.nearest.df <- cbind.data.frame(snvs.pos, ETS1.pos) %>% mutate(dist = snvs.pos - start) %>% 
  filter(dist >= -5000 & dist <= 5000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-5000, 5000, 10)))-500)*10) %>% 
  group_by(dist) %>% summarise(ETS1 = n())
plot(snvs.ETS1.nearest.df$dist, snvs.ETS1.nearest.df$ETS1, xlim = c(-500, 500), ylim = c(0, 400))

snvs.CTCF.nearest <- nearest(cluster.1.gr, CTCF.GM23338.midpoints.gr)
snvs.CTCF.nearest[is.na(snvs.CTCF.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values  
snvs.pos <- as.data.frame(ranges(cluster.1.gr))$start
CTCF.pos <- CTCF.GM23338.midpoints.gr[snvs.CTCF.nearest]
snvs.CTCF.nearest.df <- cbind.data.frame(snvs.pos, CTCF.pos) %>% mutate(dist = snvs.pos - start) %>% 
  filter(dist >= -5000 & dist <= 5000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-5000, 5000, 10)))-500)*10) %>% 
  group_by(dist) %>% summarise(CTCF = n())
plot(snvs.CTCF.nearest.df$dist, snvs.CTCF.nearest.df$CTCF, xlim = c(-500, 500), ylim = c(1100, 2500))

snvs.count.TF.df <- snvs.all.TF.nearest.df %>% left_join(snvs.ETS1.nearest.df) %>% left_join(snvs.CTCF.nearest.df) %>% 
  gather("TF", "snvs.count", -dist)
saveRDS(snvs.count.TF.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.count.TF.df.rds")

##########################################################################################################################################################################################
# Compute NER efficiency at origins binned by potential overlap with TFBS

# For CPD/CSB data

XRseq.CSB.CPD.ori.mat <- readRDS(file =  "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.ori.mat.rds")
XRseq.CSB.CPD.ori.df <- as.data.frame(XRseq.CSB.CPD.ori.mat)
XRseq.CSB.CPD.ori.df$ori.id <- ori.gr$ori.id

XRseq.CSB.CPD.no.TF.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% ori.no.TF.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.ETS1.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% ori.ETS1.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.CTCF.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% ori.CTCF.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.all.TF.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% ori.TF.id) %>% dplyr::select(-ori.id)

# Normalise values to background
XRseq.TF.summary.ori.df <- cbind.data.frame(dist = seq(-5000,5000,100), no.TF = as.vector(colMeans(XRseq.CSB.CPD.no.TF.df, na.rm = T)), ETS1 = as.vector(colMeans(XRseq.CSB.CPD.ETS1.df, na.rm = T)), CTCF = as.vector(colMeans(XRseq.CSB.CPD.CTCF.df, na.rm = T)), all.TF = as.vector(colMeans(XRseq.CSB.CPD.all.TF.df, na.rm = T))) %>% 
  mutate(no.TF = no.TF/mean(no.TF[c(1:10,90:100)]), ETS1 = ETS1/mean(ETS1[c(1:10,90:100)]), CTCF = CTCF/mean(CTCF[c(1:10,90:100)]), all.TF = all.TF/mean(all.TF[c(1:10,90:100)])) %>% 
  gather("overlap", "value", -dist)
saveRDS(XRseq.TF.summary.ori.df, file = "./002_Mut_signatures_Pan_cancer/rds/XRseq.TF.summary.ori.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# XRseq profile at TFSB

XRseq.CPD.TF.df <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.TF.df.rds")
XRseq.CPD.TF.plot <- XRseq.CPD.TF.df %>%
  ggplot(aes(x=dist, y=value, colour=overlap)) + geom_line() +
  scale_color_manual(values = c("#6D6E71", "#6FAB95", "#FABB65")) + xlim(-500, 500) +
  xlab("Distance from TFBS (bp)") + ylab("NER efficiency (normalised read count)") + 
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.CPD.TF.plot

# snvs count at TFBS

snvs.count.TF.df <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/snvs.count.TF.df.rds")
snvs.count.TF.plot <- snvs.count.TF.df %>%
  filter(TF == "all.TF") %>% 
  ggplot(aes(x=dist, y=snvs.count, colour=TF)) + geom_line(color = "#E41F1A") +
  xlim(-500, 500) + ylim(2100, 2900) +
  xlab("Distance from TFBS (bp)") + ylab("snvs count") + 
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
snvs.count.TF.plot

# Xrseq profile at origins with or without TFBS

XRseq.TF.summary.ori.df <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/XRseq.TF.summary.ori.df.rds")
XRseq.TF.summary.ori.df.plot <- XRseq.TF.summary.ori.df %>%
  filter(overlap != "all.TF") %>% 
  ggplot(aes(x=dist, y=value, colour=overlap)) + geom_line() +
  scale_color_manual(values = c("#6FAB95", "#FABB65", "#0775B6")) +
  xlab("Distance from origin (bp)") + ylab("NER efficiency (FC from background)") + 
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.TF.summary.ori.df.plot

# Prepare and save final plots

pdf("~/Desktop/Projects/OriCan/Manuscript/Nature Communications/Rplots/TFBS.plots.pdf", width=12, height=4, useDingbats=FALSE)
ggarrange(XRseq.CPD.TF.plot, snvs.count.TF.plot, XRseq.TF.summary.ori.df.plot, nrow = 1)
dev.off()
## quartz_off_screen 
##                 2

3.5 NER efficiency at origins

Because NER can be coupled to transcription and that consitutive origins significantly overlap with genes, we assess the impact of origin position on XR-seq signal. We bin origins according to their genomic location (genic, promoter or intergenic).

# r
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

# Load origin center coordinates
ori.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed", sep="\t",)
colnames(ori.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.gr <- makeGRangesFromDataFrame(ori.df, keep.extra.columns = T)
seqlevelsStyle(ori.gr) <- "UCSC"

# Recover protein coding gene id
hsapiens.coding.genes <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/hsapiens.coding.genes.rds")

# Recover gene and promoter coordinates
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
hsapiens.genes.gr <- genes(txdb)
hsapiens.coding.genes.gr <- hsapiens.genes.gr[hsapiens.genes.gr$gene_id %in% hsapiens.coding.genes$ncbi.gene.id]
hsapiens.coding.TSS.gr <- promoters(hsapiens.coding.genes.gr, upstream=500, downstream=500)

# Select origins overlapping TSS or genes
ori.genic.gr <- subsetByOverlaps(ori.gr, hsapiens.coding.genes.gr)
ori.TSS.gr <- subsetByOverlaps(ori.gr, hsapiens.coding.TSS.gr)

# Summarise
ori.df <- ori.df %>% mutate(class = case_when(ori.df$ori.id %in% ori.genic.gr$ori.id ~ "genic",
                                              ori.df$ori.id %in% ori.TSS.gr$ori.id ~ "promoter",
                                              T ~ "intergenic"))

# Compute NER efficiency at origins binned by genomic coordinates
# For CPD/CSB data

XRseq.CPD.plus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.ori.mat.rds")
XRseq.CPD.minus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.ori.mat.rds")

XRseq.CPD.plus.ori.mat.df <- as.data.frame(XRseq.CPD.plus.ori.mat)
XRseq.CPD.minus.ori.mat.df <- as.data.frame(XRseq.CPD.minus.ori.mat)

XRseq.CPD.plus.ori.mat.df$ori.id <- ori.gr$ori.id
XRseq.CPD.minus.ori.mat.df$ori.id <- ori.gr$ori.id

# Compute NER efficiency at origins binned by genomic coordinates
# For CPD/CSB data

XRseq.CSB.CPD.ori.mat <- readRDS(file =  "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.ori.mat.rds")
XRseq.CSB.CPD.ori.df <- as.data.frame(XRseq.CSB.CPD.ori.mat)
XRseq.CSB.CPD.ori.df$ori.id <- ori.gr$ori.id

XRseq.CSB.CPD.inter.ori.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% (ori.df %>% filter(class == "intergenic"))$ori.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.genic.ori.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% (ori.df %>% filter(class == "genic"))$ori.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.promoter.ori.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% (ori.df %>% filter(class == "promoter"))$ori.id) %>% dplyr::select(-ori.id)

# Normalise values to background
XRseq.CSB.CPD.summary.ori.df <- cbind.data.frame(dist = seq(-5000,5000,100), inter = as.vector(colMeans(XRseq.CSB.CPD.inter.ori.df, na.rm = T)), genic = as.vector(colMeans(XRseq.CSB.CPD.genic.ori.df, na.rm = T)), promoter = as.vector(colMeans(XRseq.CSB.CPD.promoter.ori.df, na.rm = T))) %>% 
  mutate(inter = inter/mean(inter[c(1:10,90:100)]), genic = genic/mean(genic[c(1:10,90:100)]), promoter = promoter/mean(promoter[c(1:10,90:100)])) %>% 
  gather("location", "value", -dist)
saveRDS(XRseq.CSB.CPD.summary.ori.df, file = "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.summary.ori.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

XRseq.CSB.CPD.summary.ori.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.summary.ori.df.rds")
XRseq.CSB.CPD.summary.ori.plot <- XRseq.CSB.CPD.summary.ori.df %>% 
  ggplot(aes(x=dist, y=value, colour=location)) + geom_line() +
  scale_color_manual(values = c("#4F4A75", "#0775B6", "#E4221E")) +
  xlab("Distance from origin (bp)") + ylab("NER efficiency (FC from background)") + 
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.CSB.CPD.summary.ori.plot

pdf("./Rplots/XRseq.CSB.CPD.summary.ori.plot.pdf", width=7, height=6, useDingbats=FALSE)
XRseq.CSB.CPD.summary.ori.plot
dev.off()
## quartz_off_screen 
##                 2

4 Characterisition of cluster 2 SBS signature

Cluster 2 signature present an unusual profile with enrichment of mutation in the G[T>G]G context. In this section we aim to characterise further the context of these most abundant T>G mutations.

4.1 T>G mutation context

We first combine all T>G mutations at origins and flanks to extract a sequence logo

# r

# Assess expended mutational profile of cluster 2 SBS

# Select cluster 2 genomes

clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id # 42 genomes

snvs.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
snvs.flanks.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")
snvs.ori.cluster.2.grl <- snvs.ori.grl[names(snvs.ori.grl) %in% cluster.2.id]
snvs.flanks.cluster.2.grl <- snvs.flanks.ori.grl[names(snvs.flanks.ori.grl) %in% cluster.2.id]

# Extract contexts to select the G[T>G]G mutation characterising cluster 2 SBS

# Load genome
ref.genome <- BSgenome.Hsapiens.UCSC.hg19

# At origins
snvs.ori.unlist.gr <- unlist(snvs.ori.cluster.2.grl) # 3,042 mutations
snvs.ori.mut.type <- mut_type(snvs.ori.unlist.gr)
snvs.ori.mut.context <- mut_context(snvs.ori.unlist.gr, ref_genome = ref.genome)
snvs.ori.mut.summary.df <- cbind.data.frame(seqnames = seqnames(snvs.ori.unlist.gr),
                                            start = start(ranges(snvs.ori.unlist.gr)),
                                            end = end(ranges(snvs.ori.unlist.gr)),
                                            REF = snvs.ori.unlist.gr$REF,
                                            mut.type = snvs.ori.mut.type,
                                            mut.context = snvs.ori.mut.context)
snvs.ori.mut.G.TG.G.df <- snvs.ori.mut.summary.df %>% filter(mut.type == "T>G") %>%
  mutate(strand = case_when(REF == "T" ~ "+", T ~ "-")) # 929 snvs
snvs.ori.mut.G.TG.G.gr <- makeGRangesFromDataFrame(snvs.ori.mut.G.TG.G.df, keep.extra.columns = T) + 10
snvs.ori.mut.G.TG.G.seq <- getSeq(Hsapiens, snvs.ori.mut.G.TG.G.gr)
# Build sequence logo
snvs.ori.mut.G.TG.G.seq.list <- as.vector(snvs.ori.mut.G.TG.G.seq)
saveRDS(snvs.ori.mut.G.TG.G.seq.list, file = "./002_Mut_signatures_Pan_cancer/sig/snvs.ori.mut.G.TG.G.seq.list.rds")

# At flanks
snvs.flanks.unlist.gr <- unlist(snvs.flanks.cluster.2.grl) # 34,173 mutations
snvs.flanks.mut.type <- mut_type(snvs.flanks.unlist.gr)
snvs.flanks.mut.context <- mut_context(snvs.flanks.unlist.gr, ref_genome = ref.genome)
snvs.flanks.mut.summary.df <- cbind.data.frame(seqnames = seqnames(snvs.flanks.unlist.gr),
                                            start = start(ranges(snvs.flanks.unlist.gr)),
                                            end = end(ranges(snvs.flanks.unlist.gr)),
                                            REF = snvs.flanks.unlist.gr$REF,
                                            mut.type = snvs.flanks.mut.type,
                                            mut.context = snvs.flanks.mut.context)
snvs.flanks.mut.G.TG.G.df <- snvs.flanks.mut.summary.df %>% filter(mut.type == "T>G") %>%
  mutate(strand = case_when(REF == "T" ~ "+", T ~ "-")) # 7,098 snvs
snvs.flanks.mut.G.TG.G.gr <- makeGRangesFromDataFrame(snvs.flanks.mut.G.TG.G.df, keep.extra.columns = T) + 10
snvs.flanks.mut.G.TG.G.seq <- getSeq(Hsapiens, snvs.flanks.mut.G.TG.G.gr)
# Build sequence list for logo
snvs.flanks.mut.G.TG.G.seq.list <- as.vector(snvs.flanks.mut.G.TG.G.seq)
saveRDS(snvs.flanks.mut.G.TG.G.seq.list, file = "./002_Mut_signatures_Pan_cancer/sig/snvs.flanks.mut.G.TG.G.seq.list.rds")

Plot DNA logos

library(dplyr)
library(ggplot2)
library(ggseqlogo)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load data
snvs.ori.mut.G.TG.G.seq.list <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/snvs.ori.mut.G.TG.G.seq.list.rds")
snvs.flanks.mut.G.TG.G.seq.list <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/snvs.flanks.mut.G.TG.G.seq.list.rds")

logo.col <- make_col_scheme(chars=c('A', 'T', 'C', 'G'),cols=c('#ADCC55', '#DF1F18', '#33BAED', '#E6A004'))
logo.cluster.2.ori <-ggplot() + geom_logo(snvs.ori.mut.G.TG.G.seq.list, col_scheme=logo.col) + theme_logo() + theme(aspect.ratio=0.25) + ggtitle("origins")
logo.cluster.2.flanks <- ggplot() + geom_logo(snvs.flanks.mut.G.TG.G.seq.list, col_scheme=logo.col) + theme_logo() + theme_logo() + theme(aspect.ratio=0.25) + ggtitle("flanks")
ggarrange(logo.cluster.2.ori, logo.cluster.2.flanks, nrow = 2)

pdf("./Rplots/cluster.2.DNA.logo.pdf", width=5, height=5, useDingbats=FALSE)
ggarrange(logo.cluster.2.ori, logo.cluster.2.flanks, nrow = 2)
dev.off()
## quartz_off_screen 
##                 2

Extended T>G mutation context is analysed using a river plot

# r/SLURM

# We consider 5nt around T>G mutations context

snvs.ori.cluster.2.mut.mat.ext.context <- mut_matrix(snvs.ori.unlist.gr[start(ranges(snvs.ori.unlist.gr)) %in% snvs.ori.mut.G.TG.G.df$start], ref_genome = ref.genome, extension = 5)
colnames(snvs.ori.cluster.2.mut.mat.ext.context) <- c("ori")
snvs.flanks.cluster.2.mut.mat.ext.context <- mut_matrix(snvs.flanks.unlist.gr[start(ranges(snvs.flanks.unlist.gr)) %in% snvs.flanks.mut.G.TG.G.df$start], ref_genome = ref.genome, extension = 5)
colnames(snvs.flanks.cluster.2.mut.mat.ext.context) <- c("flanks")
snvs.cluster.2.mut.mat.ext.context <- cbind(snvs.ori.cluster.2.mut.mat.ext.context, snvs.flanks.cluster.2.mut.mat.ext.context)
saveRDS(snvs.cluster.2.mut.mat.ext.context, file = "./002_Mut_signatures_Pan_cancer/sig/snvs.cluster.2.mut.mat.ext.context.rds")

# As the generation of the river plot uses a lot of memory, the plot is generated externally 

pdf("./Rplots/cluster.2.extended.profile.5nt.pdf", width=8, height=6, useDingbats=FALSE)
plot_river(snvs.cluster.2.mut.mat.ext.context) + theme(aspect.ratio=0.3)
dev.off()
C>T extended context

C>T extended context

Both DNA logos and the river plot suggest that T>G mutations occur within G-quadruplex (G4) structures.

4.2 Cluster 2 cancer sample specific signatures

# R

# Compute cancer type specific signature from tumour in cluster 2

# Load cluster information
clusters.df <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Load mutational matrix
snvs.diff.mut.mat <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Sum mutation counts per clusters
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id
# Add cancer type information
snvs.dens.ratio.df <- readRDS("~/Desktop/Projects/OriCan/001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
sample.info.df <- snvs.dens.ratio.df %>% dplyr::select(donor, cancer.type, cancer.project, Primary.Site)
cluster.2.sample.info.df <- sample.info.df %>% filter(donor %in% cluster.2.id)
unique(cluster.2.sample.info.df$cancer.type)
# "PACA" "ESAD" "MELA" "PBCA" "OV"   "BOCA" "CMDI" "BRCA"
# Compute mutational signatures for selected cancer type
# PACA
cluster.2.PACA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "PACA")
cluster.2.PACA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.PACA.id$donor)) %>% mutate(PACA = rowSums(.)) %>% dplyr::select(PACA)
# ESAD
cluster.2.ESAD.id <- cluster.2.sample.info.df %>% filter(cancer.type == "ESAD")
cluster.2.ESAD.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.ESAD.id$donor)) %>% mutate(ESAD = rowSums(.)) %>% dplyr::select(ESAD)
# MELA
cluster.2.MELA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "MELA")
cluster.2.MELA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.MELA.id$donor)) %>% mutate(MELA = rowSums(.)) %>% dplyr::select(MELA)
# PBCA
cluster.2.PBCA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "PBCA")
cluster.2.PBCA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.PBCA.id$donor)) %>% mutate(PBCA = rowSums(.)) %>% dplyr::select(PBCA)
# OV
cluster.2.OV.id <- cluster.2.sample.info.df %>% filter(cancer.type == "OV")
cluster.2.OV.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.OV.id$donor)) %>% mutate(OV = rowSums(.)) %>% dplyr::select(OV)
# BOCA
cluster.2.BOCA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "BOCA")
cluster.2.BOCA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.BOCA.id$donor)) %>% mutate(BOCA = rowSums(.)) %>% dplyr::select(BOCA)
# CMDI
cluster.2.CMDI.id <- cluster.2.sample.info.df %>% filter(cancer.type == "CMDI")
cluster.2.CMDI.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.CMDI.id$donor)) %>% mutate(CMDI = rowSums(.)) %>% dplyr::select(CMDI)
# BRCA
cluster.2.BRCA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "BRCA")
cluster.2.BRCA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.BRCA.id$donor)) %>% mutate(BRCA = rowSums(.)) %>% dplyr::select(BRCA)
# Combine all signatures
cluster.2.cancer.type.mut <- cbind.data.frame(cluster.2.PACA.mut, cluster.2.ESAD.mut, cluster.2.MELA.mut,
                                              cluster.2.PBCA.mut, cluster.2.OV.mut, cluster.2.BOCA.mut,
                                              cluster.2.CMDI.mut, cluster.2.BRCA.mut)
saveRDS(cluster.2.cancer.type.mut, file = "~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster.2.cancer.type.mut.rds")

Plot profile

library(dplyr)
library(ggplot2)
library(MutationalPatterns)

# Load data
cluster.2.cancer.type.mut <- readRDS(file = "~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster.2.cancer.type.mut.rds")

plot_96_profile(cluster.2.cancer.type.mut, condensed = TRUE, ymax = 0.45)

pdf("~/Desktop/Projects/OriCan/Manuscript/Nature Communications/Rplots/cluster.2.cancer.type.signatures.pdf", width=5, height=8, useDingbats=FALSE)
plot_96_profile(cluster.2.cancer.type.mut, condensed = TRUE, ymax = 0.45)
dev.off() 
## quartz_off_screen 
##                 2

4.3 G-quadruplex occurences at mutation sites

In this section we assess the contribution of G4s to cluster 2 SBS. To quantify the ability of sequences to fold into G4 structures, we use the G4Hunter algortithm (Bedrat et al. NAR 2016, https://academic.oup.com/nar/article/44/4/1746/1854457)

# Compute G4Hunter score of sequences encompassing T>G mutations (+- 25nt)

library(S4Vectors)
library(Biostrings)
source("./Scripts/scoreG4hunt.R")

# Compute G4H scores for T>G mutations at origins
snvs.ori.mut.G.TG.G.ext.gr <- makeGRangesFromDataFrame(snvs.ori.mut.G.TG.G.df, keep.extra.columns = T) + 25
snvs.ori.mut.G.TG.G.ext.seq <- getSeq(Hsapiens, snvs.ori.mut.G.TG.G.ext.gr)

snvs.ori.GT.G4H <- vector()
for (i in 1:length(snvs.ori.mut.G.TG.G.ext.seq)) {
  print(i/length(snvs.ori.mut.G.TG.G.ext.seq))
  seq.i <- snvs.ori.mut.G.TG.G.ext.seq[i]
  G4H.i <- scoreG4hunt(seq.i)
  snvs.ori.GT.G4H <- c(snvs.ori.GT.G4H, G4H.i)
}
snvs.ori.GT.G4H.df <- cbind.data.frame(data = "ori", G4H.score = snvs.ori.GT.G4H)

# Compute G4H scores for T>G mutations at origin flanks
snvs.flanks.mut.G.TG.G.ext.gr <- makeGRangesFromDataFrame(snvs.flanks.mut.G.TG.G.df, keep.extra.columns = T) + 25
snvs.flanks.mut.G.TG.G.ext.seq <- getSeq(Hsapiens, snvs.flanks.mut.G.TG.G.ext.gr)

snvs.flanks.GT.G4H <- vector()
for (i in 1:length(snvs.flanks.mut.G.TG.G.ext.seq)) {
  print(i/length(snvs.flanks.mut.G.TG.G.ext.seq))
  seq.i <- snvs.flanks.mut.G.TG.G.ext.seq[i]
  G4H.i <- scoreG4hunt(seq.i)
  snvs.flanks.GT.G4H <- c(snvs.flanks.GT.G4H, G4H.i)
}
snvs.flanks.GT.G4H.df <- cbind.data.frame(data = "flanks", G4H.score = snvs.flanks.GT.G4H)

# Compute G4H scores for all Ts at origins (center +- 500bp)

# Recover origin domain coordinates
ori.1kb.df <- read.table("./Dataset/ori/ori.1kb.domain.hg19.NCBI.bed", sep="\t")
colnames(ori.1kb.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.1kb.gr <- makeGRangesFromDataFrame(ori.1kb.df)
my.chr <- c(1:22, "X", "Y")
ori.1kb.gr <- ori.1kb.gr[seqnames(ori.1kb.gr) %in% my.chr]
seqlevelsStyle(ori.1kb.gr) <- "UCSC"
# Recover origin sequences
ori.1kb.seq <- getSeq(Hsapiens, ori.1kb.gr)
# Comute G4H score of all N25TN25 and N25AN25 sequences
library(stringr)
G4H.all.ori.Ts <- vector()
for(i in 1:length(ori.1kb.seq)) {
  print(i/length(ori.1kb.seq))
  seq.i <- ori.1kb.seq[i]
  T.seq.i <- str_extract_all(as.character(seq.i), pattern = "[A|C|G|T]{25}T[A|C|G|T]{25}")
  A.seq.i <- str_extract_all(as.character(seq.i), pattern = "[A|C|G|T]{25}A[A|C|G|T]{25}")
  AT.seq.i <- c(T.seq.i[[1]], A.seq.i[[1]])
  G4H.i <- vector()
  for (j in 1 :length(AT.seq.i)) {
    seq.j <- AT.seq.i[j]
    G4H.j <- scoreG4hunt(seq.j)
    G4H.i <- c(G4H.i, G4H.j)
  }
  G4H.all.ori.Ts <- c(G4H.all.ori.Ts, G4H.i)
}
G4H.all.ori.Ts.df <- cbind.data.frame(data = "all.Ts", G4H.score = G4H.all.ori.Ts)

# Combine information
snvs.GT.G4H.summary.df <- rbind(snvs.ori.GT.G4H.df, snvs.flanks.GT.G4H.df, G4H.all.ori.Ts.df)
saveRDS(snvs.GT.G4H.summary.df, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.GT.G4H.summary.df.rds")

# Compute stats
ks.test(snvs.GT.G4H.summary.df[which(snvs.GT.G4H.summary.df$data == "ori"),]$G4H.score, snvs.GT.G4H.summary.df[which(snvs.GT.G4H.summary.df$data == "flanks"),]$G4H.score) # p-value < 2.2e-16
ks.test(snvs.GT.G4H.summary.df[which(snvs.GT.G4H.summary.df$data == "ori"),]$G4H.score, snvs.GT.G4H.summary.df[which(snvs.GT.G4H.summary.df$data == "all.Ts"),]$G4H.score) # p-value < 2.2e-16
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load result
snvs.GT.G4H.summary.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/snvs.GT.G4H.summary.df.rds")

# Plot
snvs.GT.G4H.summary.plot <- snvs.GT.G4H.summary.df %>% 
  ggplot(aes(x=data, y=G4H.score, alpha = 0.3)) +
  geom_boxplot(width = 0.5, fill="#D56114") + ylim(-3,5) +
  geom_hline(yintercept=0, linetype="dashed") + ylab("G4H score") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
snvs.GT.G4H.summary.plot

pdf("./Rplots/snvs.GT.G4H.summary.plot.pdf", width=10, height=6, useDingbats=FALSE)
snvs.GT.G4H.summary.plot
dev.off()
## quartz_off_screen 
##                 2

4.4 Association between G-quadruplex formation and mutational burden

This section aims to assess the correlation between G-quadruplex formation and mutational burden at origins. To do so we recover all G-quadruplex forming sequences (of the for N5G3+N1-12G3+N1-12G3+N1-12G3+N5), assess their propensity to fold into G-quadruplexes (G4H score) and their mutability (T>G mutations).

Compute G4 propensity

# r

# Recover all G4 forming sequences at origins and compute their associated G4H scores

ori.1kb.df <- read.table("./Dataset/ori/ori.1kb.domain.hg19.NCBI.bed", sep="\t")
colnames(ori.1kb.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.1kb.gr <- makeGRangesFromDataFrame(ori.1kb.df)
my.chr <- c(1:22, "X", "Y")
ori.1kb.gr <- ori.1kb.gr[seqnames(ori.1kb.gr) %in% my.chr]
seqlevelsStyle(ori.1kb.gr) <- "UCSC"
# Recover origin sequences
ori.1kb.seq <- getSeq(Hsapiens, ori.1kb.gr)

G4.score.ori.df <- tibble()
for (i in 1:length(ori.1kb.seq)) {
  print(i/length(ori.1kb.seq))
    seq.i <- as.character(ori.1kb.seq[i])
    seqnames.i <- seqnames(ori.1kb.gr)[i]
    ranges.i <- ranges(ori.1kb.gr)[i]
    # Compute score of G4 motif on top strand
    G4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]][, 2]
    G4.seq.length.i <- nchar(G4.seq.i)
    if (length(G4.seq.i) > 0) {
      G4H.score.i <- vector()
      for (j in 1:length(G4.seq.i)) {
        G4.seq.j <- DNAStringSet(G4.seq.i)[j]
        G4H.score.j <- scoreG4hunt(G4.seq.j)
        G4H.score.i <- c(G4H.score.i, G4H.score.j)
      }  
      G4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]]
      G4.seq.start.i <- G4.seq.pos.i[,1]
      G4.seq.end.i <- G4.seq.start.i+G4.seq.length.i
      G4.ranges.start.i <- start(ranges.i)+G4.seq.start.i
      G4.ranges.end.i <- start(ranges.i)+G4.seq.end.i
      G4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = G4.ranges.start.i, end = G4.ranges.end.i, class = "G4", seq = G4.seq.i, score = G4H.score.i)
    } else {G4.summary.i <- tibble()} 
    # Compute score of G4 motif on bottom strand      
    C4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]][, 2]
    C4.seq.length.i <- nchar(C4.seq.i)
    if (length(C4.seq.i) > 0) {
      C4H.score.i <- vector()
      for (j in 1:length(C4.seq.i)) {
        C4.seq.j <- DNAStringSet(C4.seq.i)[j]
        C4H.score.j <- scoreG4hunt(reverseComplement(C4.seq.j))
        C4H.score.i <- c(C4H.score.i, C4H.score.j)
      }
      C4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]]
      C4.seq.start.i <- C4.seq.pos.i[,1]
      C4.seq.end.i <- C4.seq.start.i+C4.seq.length.i
      C4.ranges.start.i <- start(ranges.i)+C4.seq.start.i
      C4.ranges.end.i <- start(ranges.i)+C4.seq.end.i
      C4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = C4.ranges.start.i, end = C4.ranges.end.i, class = "C4", seq = C4.seq.i, score = C4H.score.i)
    } else {C4.summary.i <- tibble()}    
  G4.score.ori.df <- rbind(G4.score.ori.df, G4.summary.i, C4.summary.i)  
}
saveRDS(G4.score.ori.df, file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.ori.df.rds")

# Recover all G4 forming sequences at origin flanks and compute their associated G4H scores

ori.10kb.df <- read.table("./Dataset/ori/ori.10kb.domain.hg19.NCBI.bed", sep="\t")
colnames(ori.10kb.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.10kb.gr <- makeGRangesFromDataFrame(ori.10kb.df)
my.chr <- c(1:22, "X", "Y")
ori.10kb.gr <- ori.10kb.gr[seqnames(ori.10kb.gr) %in% my.chr]
seqlevelsStyle(ori.10kb.gr) <- "UCSC"
# Recover origin sequences
ori.10kb.seq <- getSeq(Hsapiens, ori.10kb.gr)

G4.score.flanks.df <- tibble()
for (i in 1:length(ori.10kb.seq)) {
  print(i/length(ori.10kb.seq))
  seq.i <- as.character(ori.10kb.seq[i])
  seqnames.i <- seqnames(ori.1kb.gr)[i]
  ranges.i <- ranges(ori.1kb.gr)[i]
  # Compute score of G4 motif on top strand
  G4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]][, 2]
  G4.seq.length.i <- nchar(G4.seq.i)
  if (length(G4.seq.i) > 0) {
    G4H.score.i <- vector()
    for (j in 1:length(G4.seq.i)) {
      G4.seq.j <- DNAStringSet(G4.seq.i)[j]
      G4H.score.j <- scoreG4hunt(G4.seq.j)
      G4H.score.i <- c(G4H.score.i, G4H.score.j)
    }  
    G4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]]
    G4.seq.start.i <- G4.seq.pos.i[,1]
    G4.seq.end.i <- G4.seq.start.i+G4.seq.length.i
    G4.ranges.start.i <- start(ranges.i)+G4.seq.start.i
    G4.ranges.end.i <- start(ranges.i)+G4.seq.end.i
    G4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = G4.ranges.start.i, end = G4.ranges.end.i, class = "G4", seq = G4.seq.i, score = G4H.score.i)
  } else {G4.summary.i <- tibble()} 
  # Compute score of G4 motif on bottom strand      
  C4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]][, 2]
  C4.seq.length.i <- nchar(C4.seq.i)
  if (length(C4.seq.i) > 0) {
    C4H.score.i <- vector()
    for (j in 1:length(C4.seq.i)) {
      C4.seq.j <- DNAStringSet(C4.seq.i)[j]
      C4H.score.j <- scoreG4hunt(reverseComplement(C4.seq.j))
      C4H.score.i <- c(C4H.score.i, C4H.score.j)
    }
    C4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]]
    C4.seq.start.i <- C4.seq.pos.i[,1]
    C4.seq.end.i <- C4.seq.start.i+C4.seq.length.i
    C4.ranges.start.i <- start(ranges.i)+C4.seq.start.i
    C4.ranges.end.i <- start(ranges.i)+C4.seq.end.i
    C4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = C4.ranges.start.i, end = C4.ranges.end.i, class = "C4", seq = C4.seq.i, score = C4H.score.i)
  } else {C4.summary.i <- tibble()}    
  G4.score.flanks.df <- rbind(G4.score.flanks.df, G4.summary.i, C4.summary.i)  
}
# Filter out G4 at origins
G4.score.flanks.gr <- makeGRangesFromDataFrame(G4.score.flanks.df, keep.extra.columns = T) # 181,147 G4s
G4.overlap <- findOverlaps(G4.score.flanks.gr, G4.score.ori.gr)
G4.score.flanks.filter.gr <- G4.score.flanks.gr[-queryHits(G4.overlap)] # 179,962 G4s
saveRDS(G4.score.flanks.filter.gr, file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.flanks.filter.gr.rds")

Subset mutations overlapping G4 forming sequences

# r

# Load G4 information
G4.score.ori.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.ori.df.rds")
G4.score.ori.gr <- makeGRangesFromDataFrame(G4.score.ori.df, keep.extra.columns = T) # 69,914 G4
G4.score.flanks.filter.gr <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.flanks.filter.gr.rds")

# Load cluster 2 snvs information
snvs.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
snvs.flanks.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id # 42 genomes
snvs.ori.cluster.2.grl <- snvs.ori.grl[names(snvs.ori.grl) %in% cluster.2.id]

# Format origins cluster 2 snvs
snvs.ori.cluster.2.gr <- unlist(snvs.ori.cluster.2.grl) # 3,042 mutations
snvs.ori.cluster.2.gr$mut <- paste(as.character(snvs.ori.cluster.2.gr$REF), as.character(unlist(snvs.ori.cluster.2.gr$ALT)), sep = ">")
ori.mut <- snvs.ori.cluster.2.gr$mut
ori.MUT <- case_when(ori.mut == "A>C" ~ "T>G",
                        ori.mut == "A>G" ~ "T>C",
                        ori.mut == "A>T" ~ "T>A",
                        ori.mut == "G>A" ~ "C>T",
                        ori.mut == "G>C" ~ "C>G",
                        ori.mut == "G>T" ~ "C>A",
                        T ~ ori.mut)
snvs.ori.cluster.2.gr$MUT <- ori.MUT
snvs.ori.cluster.2.TG.gr <- snvs.ori.cluster.2.gr[snvs.ori.cluster.2.gr$mut == "T>G"] # 498 snvs
snvs.ori.cluster.2.AC.gr <- snvs.ori.cluster.2.gr[snvs.ori.cluster.2.gr$mut == "A>C"] # 431 snvs

# Format flanks cluster 2 snvs
snvs.flanks.cluster.2.grl <- snvs.flanks.ori.grl[names(snvs.flanks.ori.grl) %in% cluster.2.id]
snvs.flanks.cluster.2.gr <- unlist(snvs.flanks.cluster.2.grl)
snvs.flanks.cluster.2.gr$mut <- paste(as.character(snvs.flanks.cluster.2.gr$REF), as.character(unlist(snvs.flanks.cluster.2.gr$ALT)), sep = ">")
flanks.mut <- snvs.flanks.cluster.2.gr$mut
flanks.MUT <- case_when(flanks.mut == "A>C" ~ "T>G",
                        flanks.mut == "A>G" ~ "T>C",
                        flanks.mut == "A>T" ~ "T>A",
                        flanks.mut == "G>A" ~ "C>T",
                        flanks.mut == "G>C" ~ "C>G",
                        flanks.mut == "G>T" ~ "C>A",
                                          T ~ flanks.mut)
snvs.flanks.cluster.2.gr$MUT <- flanks.MUT

# Subset G4 at origins with T>G mutations
snvs.ori.cluster.2.TG.G4.gr <- subsetByOverlaps(G4.score.ori.gr[G4.score.ori.gr$class == "G4"], snvs.ori.cluster.2.TG.gr) # 670 G4s
snvs.ori.cluster.2.AC.G4.gr <- subsetByOverlaps(G4.score.ori.gr[G4.score.ori.gr$class == "C4"], snvs.ori.cluster.2.AC.gr) # 588 C4s
snvs.ori.cluster.2.mut.G4.gr <- c(snvs.ori.cluster.2.TG.G4.gr, snvs.ori.cluster.2.AC.G4.gr) # 1258 G4s

# Compare G4Hscore of mutated and non-mutated G4s at origins
G4.ori.summary.df <- cbind.data.frame(G4H.score = G4.score.ori.gr$score, class = "all.G4s")
G4.ori.mut.summary.df <- cbind.data.frame(G4H.score = snvs.ori.cluster.2.mut.G4.gr$score, class = "mut.G4s")
G4.ori.score.summary.df <- rbind(G4.ori.summary.df, G4.ori.mut.summary.df) # 71,172 origins
# Compute stats
ks.test(G4.ori.summary.df$G4H.score, G4.ori.mut.summary.df$G4H.score) # p-value < 2.2e-16
saveRDS(G4.ori.score.summary.df, file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.summary.df.rds")

# Compare G4Hscore of G4s at origins or origin flanks
G4.ori.score.df <- cbind.data.frame(class = "ori", G4H.score = G4.ori.dist.gr$score)
G4.flanks.score.df <- cbind.data.frame(class = "flanks", G4H.score = G4.flanks.dist.gr$score)
G4.ori.flanks.summary <- rbind(G4.ori.score.df, G4.flanks.score.df) 
# Compute stats
ks.test(G4.ori.score.df$G4H.score, G4.flanks.score.df$G4H.score, alternative = "greater") # p-value = 0.09904
saveRDS(G4.ori.flanks.summary, file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.flanks.summary.rds")

# Subset mutations overlapping G4s at origins or flanks
snvs.G4.ori.cluster.2.gr <- subsetByOverlaps(snvs.ori.cluster.2.gr, G4.score.ori.gr) # 984 snvs
snvs.G4.flanks.cluster.2.gr <- subsetByOverlaps(snvs.flanks.cluster.2.gr, G4.score.flanks.filter.gr) # 533 G4s

# Compare mutation types at origins and flanks
# Prepare mutation frequency table
snvs.ori.table <- as.data.frame(table(snvs.G4.ori.cluster.2.gr$MUT))
  colnames(snvs.ori.table) <- c("mut", "count")
  snvs.ori.table <- snvs.ori.table %>% mutate(freq = count/sum(count), class = "ori")
snvs.flanks.table <- as.data.frame(table(snvs.G4.flanks.cluster.2.gr$MUT))
  colnames(snvs.flanks.table) <- c("mut", "count")
  snvs.flanks.table <- snvs.flanks.table %>% mutate(freq = count/sum(count), class = "flanks")
snvs.table.df <- rbind(snvs.ori.table, snvs.flanks.table)
saveRDS(snvs.table.df, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.table.df.rds")

Plot results.

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load results
G4.ori.score.summary.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.summary.df.rds")
G4.ori.flanks.summary <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.flanks.summary.rds")
snvs.table.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/snvs.table.df.rds")

# Plot 1

G4.ori.score.summary.plot <- G4.ori.score.summary.df %>% 
  ggplot(aes(x=class, y=G4H.score, alpha = 0.3)) +
  geom_boxplot(width = 0.5, fill="#D56114") + ylim(-0.5,4.5) +
  geom_hline(yintercept=0, linetype="dashed") + ylab("G4H score") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
G4.ori.score.summary.plot

pdf("./Rplots/G4.ori.score.summary.plot.pdf", width=10, height=6, useDingbats=FALSE)
G4.ori.score.summary.plot
dev.off()
## quartz_off_screen 
##                 2
# Plot 2

G4.ori.flanks.summary.plot <- G4.ori.flanks.summary %>% 
  ggplot(aes(x=class, y=G4H.score, alpha = 0.3)) +
  geom_boxplot(width = 0.5, fill="#D56114") + ylim(-0.5,4.5) +
  geom_hline(yintercept=0, linetype="dashed") + ylab("G4H score") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
G4.ori.flanks.summary.plot

pdf("./Rplots/G4.ori.flanks.summary.plot.pdf", width=10, height=6, useDingbats=FALSE)
G4.ori.flanks.summary.plot
dev.off()
## quartz_off_screen 
##                 2
# Plot 3

snvs.table.plot <- snvs.table.df %>% ggplot(aes(fill=mut, y=freq, x=class)) + 
  geom_bar(position="stack", stat="identity") + ylab("Frequency") +
  scale_fill_manual(values = c("#33BAED", "#010101", "#DF1F18", "#D4D2D2", "#ADCC55", "#F1D1CF")) +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
snvs.table.plot

# Compute odds ratio and associated p-value
fisher.df <- data.frame(
  "yes" = c(484, 124),
  "no" = c(500, 409),
  row.names = c("ori", "flanks"),
  stringsAsFactors = FALSE
)
colnames(fisher.df) <- c("T>G", "others")
test <- fisher.test(fisher.df) # Odds ratio 3.19, pval = 1.366678e-23
test$estimate
## odds ratio 
##   3.190414
test$p.value
## [1] 1.366678e-23
pdf("./Rplots/snvs.table.plot.pdf", width=7, height=6, useDingbats=FALSE)
snvs.table.plot
dev.off()
## quartz_off_screen 
##                 2

4.5 Mutational burden at experimentally mapped G4s

# R

library(S4Vectors)
library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg19)
source("./Scripts/scoreG4hunt.R")
source("./Scripts/G4_classification.R")

# BED files were shared by the authors of the original report
# Esnault et al. Nature Genetics volume 55, pages 1359–1369 (2023)

# We also used the authors script to extract the max G4H score of peak sequences

# Load G4access peak coordinates

G4access.HaCaT.hg19.df <- read.table("./Dataset/G4access/HaCaT_hg19_G4access.bed") %>% mutate(V5 = "HaCaT")
colnames(G4access.HaCaT.hg19.df) <- c("seqnames", "start", "end", "score", "cell.line")

G4access.K562.hg19.df <- read.table("./Dataset/G4access/K562_hg19_G4access.bed") %>% mutate(V5 = "K562")
colnames(G4access.K562.hg19.df) <- c("seqnames", "start", "end", "score", "cell.line")

G4access.Raji.hg19.df <- read.table("./Dataset/G4access/Raji_hg19_G4access.bed") %>% mutate(V5 = "Raji")
colnames(G4access.Raji.hg19.df) <- c("seqnames", "start", "end", "score", "cell.line")

# Combine and liftover
G4access.hg19.df <- rbind(G4access.HaCaT.hg19.df, G4access.K562.hg19.df, G4access.Raji.hg19.df)
G4access.hg19.gr <- makeGRangesFromDataFrame(G4access.hg19.df, keep.extra.columns = T) # 68192 peaks

hist(log10(width(G4access.hg19.gr)), breaks = 100)
# bimodal distribution with separation at 120 nt

# Recover G4 peaks at origins domain

ori.1kb.domain.bed <- read.table("./Dataset/ori/ori.1kb.domain.hg19.NCBI.bed")
colnames(ori.1kb.domain.bed) <- c("seqnames", "start", "end", "EFF", "ori.id")
ori.1kb.domain.gr <- makeGRangesFromDataFrame(ori.1kb.domain.bed, keep.extra.columns = T)
my.chr <- c(1:22, "X", "Y")
ori.1kb.domain.gr <- ori.1kb.domain.gr[seqnames(ori.1kb.domain.gr) %in% my.chr]
seqlevelsStyle(ori.1kb.domain.gr) <- "UCSC"

G4access.ori.gr <- subsetByOverlaps(G4access.hg19.gr, ori.1kb.domain.gr) # 12,809 peaks

# Add peak.id

peak.id <- paste0("G4access_", seq(1,length(G4access.ori.gr),1))
G4access.ori.gr$peak.id <- peak.id

# Recover peak sequences

G4access.ori.seq <- getSeq(Hsapiens, G4access.ori.gr)
names(G4access.ori.seq) <- G4access.ori.gr$peak.id

# Compute max G4H score and recover associated sequence - a rolling window of 25 nt is used

library(data.table)
library(parallel)

# Parallelisable inner function to compute G4H subsequence results
compute_subseq <- function(i, seq.j, name.j) {
  subseq.i <- subseq(seq.j, i, i + 24)
  G4H.subseq.i <- scoreG4hunt(subseq.i)
  
  # Return a data.table with the relevant information
  return(data.table(
    peak.id = name.j,
    G4.pos = i,
    G4.seq = as.character(subseq.i),
    G4H = G4H.subseq.i,
    abs = G4H.subseq.i^2
  ))
}

# Main function to loop over sequences and compute max G4H for each
G4H.subseq.G4H.max.df <- data.table()

# Detect number of available cores
num_cores <- detectCores() - 1

# Loop through each sequence
for (j in seq_along(G4access.ori.seq)) {
  print(j / length(G4access.ori.seq))
  
  seq.j <- G4access.ori.seq[j]
  name.j <- names(G4access.ori.seq)[j]
  seq_width <- width(seq.j) - 25
  
  # Use parallel computation to process subsequences
  subseq_results <- mclapply(1:seq_width, compute_subseq, seq.j = seq.j, name.j = name.j, mc.cores = num_cores)
  
  # Combine results into a single data.table
  subseq_results <- rbindlist(subseq_results)
  
  # Find the max G4H for this sequence and append to the final result
  max_result <- subseq_results[which.max(abs), .(peak.id, G4.pos, G4.seq, G4H)]
  G4H.subseq.G4H.max.df <- rbind(G4H.subseq.G4H.max.df, max_result)
}
saveRDS(G4H.subseq.G4H.max.df, "./002_Mut_signatures_Pan_cancer/rds/G4access.subseq.G4H.max.df.rds")

# Add G4H score to peak information

G4access.ori.G4H.gr <- as.data.frame(G4access.ori.gr) %>% left_join(G4H.subseq.G4H.max.df, by = "peak.id") %>% mutate(start = start + G4.pos, end = start + 25) %>% dplyr::select(-G4.pos) %>% GRanges()
saveRDS(G4access.ori.G4H.gr, "./002_Mut_signatures_Pan_cancer/rds/G4access.ori.G4H.gr.rds")

# Identify mutated G4 sequences

# Load cluster 2 snvs information
snvs.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
snvs.flanks.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id # 42 genomes
snvs.ori.cluster.2.grl <- snvs.ori.grl[names(snvs.ori.grl) %in% cluster.2.id]
snvs.ori.cluster.2.gr <- unlist(snvs.ori.cluster.2.grl) # 3,042 snvs 

G4access.ori.mutated.G4.gr <- subsetByOverlaps(G4access.ori.G4H.gr, snvs.ori.cluster.2.gr) # 179 G4s

ks.test(abs(G4access.ori.mutated.G4.gr$G4H), abs(G4access.ori.G4H.gr$G4H))
# p-value = 4.916e-09

# Prepare df

G4access.ori.mutated.G4.df <- as.data.frame(G4access.ori.mutated.G4.gr) %>% mutate(class = "mut.G4s")
G4access.ori.all.G4.df <- as.data.frame(G4access.ori.G4H.gr) %>% mutate(class = "all.G4s")
G4access.ori.G4.summary.df <- rbind(G4access.ori.mutated.G4.df, G4access.ori.all.G4.df)

# Mutated G4 are predicted to be more stable

# Indentify G4 subtypes within mutated and non mutated origin G4

mutated.G4.seq <- as.character(G4access.ori.mutated.G4.gr$G4.seq) # 179
non.mutated.G4.seq <- as.character(G4access.ori.G4H.gr$G4.seq) # 12809

mutated.G4.subtype.df <- G4.classification.func(mutated.G4.seq)
non.mutated.G4.subtype.df <- G4.classification.func(non.mutated.G4.seq)

# Compute stats and bind results

mutated.G4.subtype.summary.df <- mutated.G4.subtype.df %>% dplyr::select(motif, count.mutated = count) %>% mutate(freq.mutated = count.mutated/sum(count.mutated))
non.mutated.G4.subtype.summary.df <- non.mutated.G4.subtype.df %>% dplyr::select(motif, count.non.mutated = count) %>% mutate(freq.non.mutated = count.non.mutated/sum(count.non.mutated))
G4.subtype.summary.df <- mutated.G4.subtype.summary.df %>% left_join(non.mutated.G4.subtype.summary.df, by = "motif") %>% mutate(enr = log2(freq.mutated/freq.non.mutated))

# Mutated G4 are enriched for canonical (no bulges) short loop G4s

# Plot results

# Pies reporting the distribution of G4 subtypes

mutated.G4.subtype.pie <- mutated.G4.subtype.summary.df %>%
 mutate(Motifs = fct_relevel(motif, rev(c("loop1-3", "loop4-5", "loop6-7", "long_loops", "simple_bulges", "2tetrads_cbulges", "others")))) %>%
ggplot(aes(x="", y=freq.mutated, fill=Motifs)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) + ggtitle("Mutated G4s") +
  scale_fill_manual(values = c("#D4D2D2", "#33BAED", "#08306B", "#ADCC54","#FFEDA0", "#FD8D3C", "#D6604D")) +
  theme_void() + theme(aspect.ratio=1)
mutated.G4.subtype.pie

non.mutated.G4.subtype.pie <- non.mutated.G4.subtype.summary.df %>%
  mutate(Motifs = fct_relevel(motif, rev(c("loop1-3", "loop4-5", "loop6-7", "long_loops", "simple_bulges", "2tetrads_cbulges", "others")))) %>%
  ggplot(aes(x="", y=freq.non.mutated, fill=Motifs)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) + ggtitle("All G4s") +
  scale_fill_manual(values = c("#D4D2D2", "#33BAED", "#08306B", "#ADCC54","#FFEDA0", "#FD8D3C", "#D6604D")) +
  theme_void() + theme(aspect.ratio=1)
non.mutated.G4.subtype.pie

# Plot reporting the enrichment of each subtypes

G4.subtype.summary.plot <- G4.subtype.summary.df %>%
  mutate(Motif = fct_relevel(motif, c("loop1-3", "loop4-5", "loop6-7", "long_loops", "simple_bulges", "2tetrads_cbulges", "others"))) %>%
  ggplot(aes(x=Motif, y=enr)) +
  geom_bar(stat = "identity", fill="#D56114") + ylab("Motif enrichement (log2 FC)") +
  geom_hline(yintercept=0, linetype="dashed") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
G4.subtype.summary.plot

# Save all plots

ggarrange(G4access.ori.G4.summary.plot, G4.subtype.summary.plot, mutated.G4.subtype.pie, non.mutated.G4.subtype.pie, nrow = 2, ncol = 2)

pdf("./Manuscript/Nature Communications/Rplots/G4access.plots.pdf", width=6, height=6, useDingbats=FALSE)
ggarrange(G4access.ori.G4.summary.plot, G4.subtype.summary.plot, mutated.G4.subtype.pie, non.mutated.G4.subtype.pie, nrow = 2, ncol = 2)
dev.off()

4.6 G-quadruplex SBS signature

Extract the mutational profile of G-quadruplexes corrected for local trinucleotide composition.

Compute trinucleotide statistics for all mutated G4s.

# r
library(tibble)

# Add unique G4 id
G4.mut.gr <- subsetByOverlaps(G4.score.ori.gr, snvs.ori.cluster.2.gr)
G4.mut.gr$G4.id <- paste0("G4.", rep(1:length(G4.mut.gr)))
G4.mut.seq <- DNAStringSet(G4.mut.gr$seq) # 2,518 sequences

# Initialise trinucleotide columns
G4.mut.tri.stats.df <- tibble(tri = trinuc.sum.df$tri)
for (i in 1:length(G4.mut.seq)) {
  print(i/length(G4.mut.seq))
  id.i <- G4.ori.gr$G4.id[i]
  seq.i <- G4.mut.seq[i]
  orientation.i <- G4.ori.gr$class[i]
  if(orientation.i == "G4") {G4.seq.i <- seq.i} else {G4.seq.i <- reverseComplement(seq.i)}
  G4.tri <- trinucleotideFrequency(G4.seq.i, step=1, as.prob = FALSE)
  G4.tri.df <- t(G4.tri) %>% as.data.frame() %>% rownames_to_column("tri")  
  colnames(G4.tri.df) <- c("tri", id.i)
  G4.mut.tri.stats.df <- G4.mut.tri.stats.df %>% left_join(G4.tri.df, by = "tri")
}
saveRDS(G4.mut.tri.stats.df, file = "./002_Mut_signatures_Pan_cancer/rds/G4.mut.tri.stats.df.rds")

Plot G4 trinucleotide composition.

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load trinucleotide statistics
G4.mut.tri.stats.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.mut.tri.stats.df.rds")
G4.all.mut.tri.stats.df <- cbind.data.frame(tri = G4.mut.tri.stats.df[,1], tri.all = rowSums(G4.mut.tri.stats.df[,-1]))

# Plot
G4.all.mut.tri.stats.plot <- G4.all.mut.tri.stats.df %>% mutate(rank = dense_rank(dplyr::desc(tri.all))) %>%
  mutate(col = case_when(rank >= 10 ~ "1", T ~ "0")) %>% 
  ggplot(aes(x=rank, y=tri.all)) + 
  geom_point() + scale_y_log10(limits = c(10,20000)) +
  geom_text(aes(label = tri, color = col), angle = 45) + 
  scale_color_manual(values = c("black", NA), na.value = NA) + ylab("Trinuclotide occurence (log10 scale)") + xlab("Rank") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none")
G4.all.mut.tri.stats.plot

pdf("./Rplots/G4.tri.occurence.pdf", width=7, height=6, useDingbats=FALSE)
G4.all.mut.tri.stats.plot
dev.off()
## quartz_off_screen 
##                 2

Correct observed profile and assess strand bias

# r
library(plyranges)

# Load origin signatures
snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")

# Correct matrix
snvs.G4.cluster.2.corr.mut.mat <- snvs.ori.mut.cluster %>% as.data.frame() %>% dplyr::select(cluster.2) %>% tibble::rownames_to_column(var = "Tri.mut") %>% tidyr::separate(Tri.mut, c("D", "R", "A", "U"), sep = "\\[|>|]", remove = F) %>% mutate(tri = paste0(D, R, U, sep = "")) %>%
  dplyr::select(-D, -R, -A, -U, -Tri.mut) %>% left_join(G4.all.mut.tri.stats.df, by = "tri") %>%
  mutate(G4 = cluster.2/tri.all, G4.norm = G4/sum(G4)) %>% select(G4 = G4.norm)

# Add G4 mutational signature
snvs.ori.G4.mut.cluster <- cbind.data.frame(snvs.ori.mut.cluster, snvs.G4.cluster.2.corr.mut.mat) %>% as.matrix()
saveRDS(snvs.ori.G4.mut.cluster, "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.G4.signatures.rds")

Plot

library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load signatures
snvs.ori.mut.cluster.G4.signatures.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.G4.signatures.rds")

# Plot comparative profiles
plot_compare_profiles(snvs.ori.mut.cluster.G4.signatures.mat[,2], snvs.ori.mut.cluster.G4.signatures.mat[,6], profile_names = c("cluster.2", "G4"), condensed = TRUE)

pdf("./Rplots/snvs.ori.cluster.2.vs.G4.pdf", width=5, height=6, useDingbats=FALSE)
plot_compare_profiles(snvs.ori.mut.cluster.G4.signatures.mat[,2], snvs.ori.mut.cluster.G4.signatures.mat[,6], profile_names = c("cluster.2", "G4"), condensed = TRUE)
dev.off() 
## quartz_off_screen 
##                 2

4.7 Contribution of G-quadruplex to cluster 2 SBS

Assess the contribution of G4 to overall mutation rates in cluster 2

# R

################################################################################
# Map G4-forming sequences within 100 nt windows around origins

# Load origin 10kb domains
ori.10kb.df <- read.table("./Dataset/ori/ori.10kb.domain.hg19.NCBI.bed", sep="\t")
colnames(ori.10kb.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.10kb.gr <- makeGRangesFromDataFrame(ori.10kb.df, keep.extra.columns = T)
ori.10kb.gr <- ori.10kb.gr[seqnames(ori.10kb.gr) %in% c(1:22, "X", "Y")]
seqlevelsStyle(ori.10kb.gr) <- "UCSC"

# Recover sequences
ori.10kb.seq <- getSeq(Hsapiens, ori.10kb.gr)

# Map G4 forming sequences in each ori domain
G4.ori.map.df <- tibble()
for (i in 1:length(ori.10kb.gr)) {
  print(i/length(ori.10kb.gr))
  ori.id.i <- ori.10kb.gr$ori.id[i]
  seqnames.i <- seqnames(ori.10kb.gr)[i]
  ranges.i <- ranges(ori.10kb.gr)[i]
  ori.midpoint.i <- round((start(ranges.i)+end(ranges.i))/2)
  seq.i <- as.character(ori.10kb.seq[i])
  # Identify G4s
  G4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]][, 2]
  G4.seq.length.i <- nchar(G4.seq.i)
  G4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]]
  G4.seq.start.i <- G4.seq.pos.i[,1]
  G4.seq.end.i <- G4.seq.start.i+G4.seq.length.i
  G4.ranges.start.i <- start(ranges.i)+G4.seq.start.i
  G4.ranges.end.i <- start(ranges.i)+G4.seq.end.i
    if (length(G4.seq.i) == 0) {G4.summary.i = tibble()} else {
  G4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = G4.ranges.start.i, end = G4.ranges.end.i)}
  # Identify C4s
  C4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]][, 2]
  C4.seq.length.i <- nchar(C4.seq.i)
  C4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]]
  C4.seq.start.i <- C4.seq.pos.i[,1]
  C4.seq.end.i <- C4.seq.start.i+C4.seq.length.i
  C4.ranges.start.i <- start(ranges.i)+C4.seq.start.i
  C4.ranges.end.i <- start(ranges.i)+C4.seq.end.i
    if (length(C4.seq.i) == 0) {C4.summary.i = tibble()} else {
  C4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = C4.ranges.start.i, end = C4.ranges.end.i)}
  # Combine all ranges
  all.summary.i <- rbind(G4.summary.i, C4.summary.i)
    if (nrow(all.summary.i) == 0) {all.df.i = tibble()} else {
  all.gr.i <- reduce(makeGRangesFromDataFrame(all.summary.i))
  # Compute midpoints and compute distance to origin
  all.df.i <- as.data.frame(all.gr.i) %>% mutate(G4.midpoint = round((start+end)/2), ori.id = ori.id.i, ori.midpoint = ori.midpoint.i, G4.ori.dist = G4.midpoint-ori.midpoint)}
  # Append results
  G4.ori.map.df <- rbind(G4.ori.map.df, all.df.i)
}
saveRDS(G4.ori.map.df, "./002_Mut_signatures_Pan_cancer/rds/G4.ori.map.df.rds")

G4.ori.map.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/G4.ori.map.df.rds")
hist(G4.ori.map.df$G4.ori.dist, breaks = 100)

G4.ori.map.gr <- makeGRangesFromDataFrame(G4.ori.map.df, keep.extra.columns = T)

################################################################################
# Compute mutational burden at origins considering G4 forming sequences or not

# Load cluster 2 snvs coordinates

cluster.2.grl <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.grl.rds")
cluster.2.gr <- unlist(cluster.2.grl)

# Filter out snvs overlapping with G4 forming sequences

cluster.2.G4.overlap <- findOverlaps(cluster.2.gr, G4.ori.map.gr)
cluster.2.G4.gr <- cluster.2.gr[-queryHits(cluster.2.G4.overlap)]

# Compute snvs distances to origins

ori.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed")
colnames(ori.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.gr <- makeGRangesFromDataFrame(ori.df, keep.extra.columns = T)
seqlevelsStyle(ori.gr) <- "UCSC"

snvs.cluster.2.ori.nearest <- nearest(cluster.2.gr, ori.gr)
snvs.cluster.2.ori.nearest[is.na(snvs.cluster.2.ori.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values  
snvs.pos <- as.data.frame(ranges(cluster.2.gr))$start
ori.pos <- ori.gr[snvs.cluster.2.ori.nearest]
snvs.cluster.2.ori.nearest.df <- cbind.data.frame(snvs.pos, ori.pos) %>% mutate(dist = snvs.pos - start) %>% 
  filter(dist >= -10000 & dist <= 10000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-10000, 10000, 100)))-100)*100) %>% 
  group_by(dist) %>% summarise(total.snvs.count = n())

snvs.cluster.2.G4.ori.nearest <- nearest(cluster.2.G4.gr, ori.gr)
snvs.cluster.2.G4.ori.nearest[is.na(snvs.cluster.2.G4.ori.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values  
snvs.pos <- as.data.frame(ranges(cluster.2.G4.gr))$start
ori.pos <- ori.gr[snvs.cluster.2.G4.ori.nearest]
snvs.cluster.2.G4.ori.nearest.df <- cbind.data.frame(snvs.pos, ori.pos) %>% mutate(dist = snvs.pos - start) %>% 
  filter(dist >= -10000 & dist <= 10000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-10000, 10000, 100)))-100)*100) %>% 
  group_by(dist) %>% summarise(snvs.no.G4.count = n())

snvs.cluster.2.G4.summary <- snvs.cluster.2.ori.nearest.df %>% left_join(snvs.cluster.2.G4.ori.nearest.df) 

plot(snvs.cluster.2.ori.nearest.df$dist, snvs.cluster.2.ori.nearest.df$total.snvs.count, col = "red", xlim = c(-5000, 5000))
points(snvs.cluster.2.G4.ori.nearest.df$dist, snvs.cluster.2.G4.ori.nearest.df$snvs.no.G4.count, col = "blue")

# Correct snvs count per base coverage

ori.10kb.domain.100nt.split.gr <- import("./Dataset/ori/ori.10kb.domain.100nt.split.hg19.NCBI.bed")
my.chr <- c(1:22, "X", "Y")
seqlevelsStyle(ori.10kb.domain.100nt.split.gr) <- "UCSC"

base.coverage.df <- as.data.frame(ori.10kb.domain.100nt.split.gr) %>% group_by(name) %>%
  summarise(base = sum(width)) %>% mutate(dist = (as.numeric(name)-100)*100)

ori.G4.hits <- findOverlaps(ori.10kb.domain.100nt.split.gr, G4.ori.map.gr)
ori.G4.intersect <- pintersect(ori.10kb.domain.100nt.split.gr[queryHits(hits)],
                      G4.ori.map.gr[subjectHits(hits)])
G4.coverage.df <-  as.data.frame(ori.G4.intersect) %>% group_by(name) %>%
  summarise(G4.cov = sum(width)) %>% mutate(dist = (as.numeric(name)-100)*100) %>%
  left_join(base.coverage.df) %>% mutate(G4.base = base-G4.cov) %>% select(dist, base, G4.base)

snvs.cluster.2.G4.summary.df <- snvs.cluster.2.G4.summary %>% left_join(G4.coverage.df) %>%
  mutate(total.snvs.count.norm = total.snvs.count/base, snvs.no.G4.count.norm = snvs.no.G4.count/G4.base)
saveRDS(snvs.cluster.2.G4.summary.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.G4.summary.df.rds")

plot(snvs.cluster.2.G4.summary$dist, snvs.cluster.2.G4.summary$total.snvs.count.norm, type = "l", col = "red", xlim = c(-5000, 5000))
points(snvs.cluster.2.G4.summary$dist, snvs.cluster.2.G4.summary$snvs.no.G4.count.norm, type = "l", col = "blue")

snvs.cluster.2.G4.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.G4.summary.df.rds")

snvs.cluster.2.G4.summary.plot <- snvs.cluster.2.G4.summary %>%
  select(dist, all = total.snvs.count.norm, no.G4= snvs.no.G4.count.norm) %>%
  mutate(all = all*10e3, no.G4 = no.G4*10e3) %>% 
  gather(class, value, -dist) %>% 
  ggplot(aes(x = dist, y = value, color = class)) +
  geom_line() + xlim(-5000, 5000) +
  scale_color_manual(values = c("#514B76", "#D56114")) +
  xlab("Distance from origin (bp)") + ylab("Mutational burden (mutations per kb)") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
snvs.cluster.2.G4.summary.plot

################################################################################
# Assess the contribution of G4 mutagenesis per cancer samples

# Load G4 coordinates
G4.ori.map.gr <- makeGRangesFromDataFrame(G4.ori.map.df, keep.extra.columns = T)
seqlevelsStyle(G4.ori.map.gr) <- "NCBI"

# Load manifest
SSM.manifest.df <- read.table(gzfile("./Dataset/ICGC/SSM/manifest.1698683356752.tar.gz"), skip = 1)
SSM.manifest.df <- SSM.manifest.df %>% dplyr::select(V9, V10, V2, V3, V5)
colnames(SSM.manifest.df) <- c("donor.id", "project", "file.id", "object.id", "file.name")

# Select cluster 2 samples
SSM.cluster.2.manifest.df <- SSM.manifest.df %>% filter(donor.id %in% names(cluster.2.grl))

# Open ori and flanks coordinates
ori.1kb.domains.gr <- import("./Dataset/ori/BED/ori.1kb.hg19.bed")
ori.flanks.gr <- import("./Dataset/ori/BED/ori.flanks.hg19.bed")

# List vcf files
vcf.files <- list.files("./Dataset/ICGC/SSM/VCF/") # 3,892 files
vcf.files <- vcf.files[!grepl(".idx", vcf.files)]  # 3,892 snvs vcf files
cluster.2.vcf.files <- vcf.files[vcf.files %in% SSM.cluster.2.manifest.df$file.name] # 46 files

# Count snvs
snvs.cluster.2.count.df <- tibble()
for (i in 1:length(cluster.2.vcf.files)) {
  vcf.files.i <- cluster.2.vcf.files[i]
  print(vcf.files.i)
  df.i <- SSM.cluster.2.manifest.df %>% filter(file.name == vcf.files.i)
  donor.i <- df.i$donor.id
  path.i <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.i, sep = "")
  vcf.i <- read.table(gzfile(path.i), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5)
  bed.i <- vcf.i %>% mutate(end = V2 + 1) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
  gr.i <- makeGRangesFromDataFrame(bed.i)
  # Total count
    ori.overlap.i <- subsetByOverlaps(gr.i, ori.1kb.domains.gr)
    snvs.ori.count.i <- length(ori.overlap.i)
    flanks.overlap.i <- subsetByOverlaps(gr.i, ori.flanks.gr)
    snvs.flanks.count.i <- length(flanks.overlap.i)
  # Count filtering G4 overlapping snvs
    ori.G4.overlap.i <- findOverlaps(ori.overlap.i, G4.ori.map.gr)
      ori.G4.overlap.2.i <- ori.overlap.i[-queryHits(ori.G4.overlap.i)]
      snvs.ori.no.G4.count.i <- length(ori.G4.overlap.2.i)
    flanks.G4.overlap.i <- findOverlaps(flanks.overlap.i, G4.ori.map.gr)
      flanks.G4.overlap.2.i <- flanks.overlap.i[-queryHits(flanks.G4.overlap.i)]
      snvs.flanks.no.G4.count.i <- length(flanks.G4.overlap.2.i)
  # Combine information
  df.summary.i <- cbind.data.frame(donor = donor.i, snvs.ori.count = snvs.ori.count.i, snvs.flanks.count = snvs.flanks.count.i, snvs.ori.no.G4.count = snvs.ori.no.G4.count.i, snvs.flanks.no.G4.count = snvs.flanks.no.G4.count.i)
  snvs.cluster.2.count.df <- rbind(snvs.cluster.2.count.df, df.summary.i)
}
saveRDS(snvs.cluster.2.count.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.count.df.rds")

# Compute mutational burden
ori.df <- snvs.cluster.2.G4.summary.2 %>% filter(dist >= 1000 & dist <= 1000)
flanks.df <- snvs.cluster.2.G4.summary.2 %>% filter(dist < 1000 | dist > 1000)
  ori.base <- sum(ori.df$base)
  ori.no.G4.base <- sum(ori.df$G4.base)
  flanks.base <- sum(flanks.df$base)
  flanks.no.G4.base <- sum(flanks.df$G4.base)
snvs.cluster.2.mut.burden.df <- snvs.cluster.2.count.df %>% 
  mutate(snvs.ori.burden = snvs.ori.count/ori.base,
         snvs.ori.no.G4.burden = snvs.ori.no.G4.count/ori.no.G4.base,
         snvs.flanks.burden = snvs.flanks.count/flanks.base,
         snvs.flanks.no.G4.burden = snvs.flanks.no.G4.count/flanks.no.G4.base)
saveRDS(snvs.cluster.2.mut.burden.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.mut.burden.df.rds")

# Compute contribution of G4 forming sequences to origin mutational burden
snvs.cluster.2.mut.burden.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.mut.burden.df.rds")
snvs.cluster.2.G4.contribution.df <- snvs.cluster.2.mut.burden.df %>%
  mutate(ori.ratio = snvs.ori.no.G4.burden/snvs.ori.burden, flanks.ratio = snvs.flanks.no.G4.burden/snvs.flanks.burden) %>% 
  mutate(ori = (1-ori.ratio)*100, flanks = (1-flanks.ratio)*100)

snvs.cluster.2.G4.contribution.plot <- snvs.cluster.2.G4.contribution.df %>% select(donor, ori, flanks) %>% 
  gather(class, value, -donor) %>% 
  ggplot(aes(x=class, y=value, alpha = 0.3)) +
  geom_jitter(color="black", size=0.5, alpha=0.4) + ylim(-2, 100) +
  geom_boxplot(width = 0.5, fill="#D56114") + ylab("Contribution of G4 sequences\nto mutational burden (%)") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
snvs.cluster.2.G4.contribution.plot

summary(snvs.cluster.2.G4.contribution.df$ori)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 4.83   19.10   31.65   33.02   46.52   72.52 
# G4s contribute in average to ~ 33 % and up to 72 % to mutagenesis at origins 

# Prepare and save final plot

pdf("./Manuscript/Nature Communications/Rplots/G4.mut.burden.pdf", width=12, height=4, useDingbats=FALSE)
ggarrange(snvs.cluster.2.G4.summary.plot, snvs.cluster.2.G4.contribution.plot, nrow = 1)
dev.off()

4.8 Cluster 2 SBS signature strand bias

Assess potential strand biases associated with cluster 2 SBS signature.

# r

# Load cluster 2 snvs gr list
cluster.2.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.2.snvs.grl.rds")

# Split pyrimidine and purine mutations
cluster.2.pyr.grl <- cluster.2.snvs.grl
cluster.2.pur.grl <- cluster.2.snvs.grl
for (i in 1:length(cluster.2.snvs.grl)) {
  print(i/length(cluster.2.snvs.grl))
  gr.i <- cluster.2.snvs.grl[i]
  bin.i <- names(gr.i)
  unlist.gr.i <- unlist(gr.i)
  pyr.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "C" | as.character(unlist.gr.i$REF) == "T"]
  pyr.gr.comp.i <- GRangesList(pyr.gr.i, compress=TRUE)
  cluster.2.pyr.grl[i] <- pyr.gr.comp.i
  pur.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "G" | as.character(unlist.gr.i$REF) == "A"]
  pur.gr.comp.i <- GRangesList(pur.gr.i, compress=TRUE)
  cluster.2.pur.grl[i] <- pur.gr.comp.i
}
saveRDS(cluster.2.pyr.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.pyr.grl.rds")
saveRDS(cluster.2.pur.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.pur.grl.rds")

# Compute mutational matrices
cluster.2.pyr.mat <- mut_matrix(vcf_list = cluster.2.pyr.grl, ref_genome = ref.genome)
cluster.2.pur.mat <- mut_matrix(vcf_list = cluster.2.pur.grl, ref_genome = ref.genome)

# Fit signatures
cluster.2.pyr.fit <- fit_to_signatures(cluster.2.pyr.mat, snvs.ori.G4.mut.cluster)
cluster.2.pur.fit <- fit_to_signatures(cluster.2.pur.mat, snvs.ori.G4.mut.cluster)

# Format
cluster.2.pyr.fit.df <- as.data.frame(cluster.2.pyr.fit$contribution)
cluster.2.pyr.fit.df <- as.data.frame(t(cluster.2.pyr.fit.df)) %>% rownames_to_column(var = "dist") %>% select(dist, value = cluster.2) %>% mutate(strand = "plus")
cluster.2.pur.fit.df <- as.data.frame(cluster.2.pur.fit$contribution)
cluster.2.pur.fit.df <- as.data.frame(t(cluster.2.pur.fit.df)) %>% rownames_to_column(var = "dist") %>% select(dist, value = cluster.2) %>% mutate(strand = "minus")
cluster.2.strand.fit.df <- rbind(cluster.2.pyr.fit.df, cluster.2.pur.fit.df) 
cluster.2.strand.fit.df$dist <- as.numeric(cluster.2.strand.fit.df$dist)
saveRDS(cluster.2.strand.fit.df, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.strand.fit.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

cluster.2.strand.fit.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/cluster.2.strand.fit.df.rds")
cluster.2.strand.fit.plot <- cluster.2.strand.fit.df %>% ggplot(aes(x = dist, y = value, color = strand)) +
  geom_line() + xlim(-5000, 5000) +
  scale_color_manual(values = c("#514B76", "#D56114")) +
  xlab("Distance from origin (bp)") + ylab("Absolute SBS\ncluster 1 contribution") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.strand.fit.plot

pdf("./Rplots/cluster.2.strand.fit.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.2.strand.fit.plot
dev.off()
## quartz_off_screen 
##                 2

This signature do not show any strand bias.

4.9 Patterns of mutation at G-quadruplexes

Assess the distribution of snvs within G4s when oriented toward the direction of replication.

# r

# Compute distances of identified G4s to origins and assign a replication strand (leading or lagging).

G4.score.ori.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.ori.df.rds")
G4.score.ori.gr <- makeGRangesFromDataFrame(G4.score.ori.df, keep.extra.columns = T)

ori.inter.dist.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed", sep="\t")
colnames(ori.inter.dist.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.inter.dist.gr <- makeGRangesFromDataFrame(ori.inter.dist.df, keep.extra.columns = T)
seqlevelsStyle(ori.inter.dist.gr) <- "UCSC"

G4.ori.dist <- nearest(G4.score.ori.gr, ori.inter.dist.gr)
ori.dist.gr <- ori.inter.dist.gr[G4.ori.dist]
G4.ori.dist.gr <- as.data.frame(G4.score.ori.gr) %>% mutate(ori.pos = start(ori.dist.gr), ori.dist = start-ori.pos) %>%
  mutate(ori.pos = case_when(ori.dist <= 0 ~ "upstream", T ~ "downtream")) %>%
  mutate(rep.strand = case_when((ori.pos == "upstream" & class == "G4") ~ "leading",
                                (ori.pos == "downtream" & class == "C4") ~ "leading",
                                T ~ "lagging")) %>% 
  makeGRangesFromDataFrame(keep.extra.columns = T)
saveRDS(G4.ori.dist.gr, file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.dist.gr.rds")

# Compute oriented distances of mapped snvs
# G4 motif start at 6nt of the reported position due to the specific regex we used, so we add/remove 7 to the mapped position to have the start of the G4 motif at position 0

snvs.G4.gr <- snvs.ori.cluster.2.gr
snvs.G4.dist <- nearest(snvs.G4.gr, G4.ori.dist.gr)
G4.start <- start(G4.ori.dist.gr[snvs.G4.dist])+7
G4.end <- end(G4.ori.dist.gr[snvs.G4.dist])-7
G4.ori.dist <- G4.ori.dist.gr[snvs.G4.dist]$ori.dist
G4.class <- G4.ori.dist.gr[snvs.G4.dist]$class
snvs.G4.dist.df <- cbind.data.frame(as.data.frame(snvs.G4.gr), G4.start, G4.end, G4.ori.dist, G4.class) %>% 
  mutate(G4.snv.dist = case_when(G4.ori.dist < 0 ~ G4.end-start,
                                 G4.ori.dist > 0 ~ G4.start-start))
saveRDS(snvs.G4.dist.df, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.G4.dist.df.rds")

# Extract G4 domain and compute G frequency for G4 visualisation

G4.up.df <- snvs.G4.dist.df %>% filter(G4.ori.dist > 0 & G4.class == "G4") %>% dplyr::select(seqnames, start = G4.start) %>% 
  mutate(start = start-52, end = start+150) # rephase and expand domains
G4.up.gr <- makeGRangesFromDataFrame(G4.up.df)
G4.up.views <- Views(Hsapiens, G4.up.gr)
G4.up.consMat <- consensusMatrix(G4.up.views, as.prob=T, shift=0L, baseOnly=TRUE, width=150)
G4.up.consMat.df <- cbind.data.frame(dist = seq(-49,100,1), G.freq.up = G4.up.consMat[3,])

G4.down.df <- snvs.G4.dist.df %>% filter(G4.ori.dist < 0 & G4.class == "G4") %>% dplyr::select(seqnames, start = G4.end) %>% 
  mutate(end = start+51, start = end-150, ) # rephase and expand domains
G4.down.gr <- makeGRangesFromDataFrame(G4.down.df)
G4.down.views <- Views(Hsapiens, G4.down.gr)
G4.down.consMat <- consensusMatrix(G4.down.views, as.prob=T, shift=0L, baseOnly=TRUE, width=150)
G4.down.consMat.df <- cbind.data.frame(dist = rev(seq(-50,99,1)), G.freq.down = G4.down.consMat[3,])

# Combine information
G4.consMat.df <- G4.up.consMat.df %>% left_join(G4.down.consMat.df, by = "dist") %>% mutate(G.freq = (G.freq.up+G.freq.down)/2)
saveRDS(G4.consMat.df, file = "./002_Mut_signatures_Pan_cancer/rds/G4.consMat.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load data
snvs.G4.dist.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/snvs.G4.dist.df.rds")
G4.consMat.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.consMat.df.rds")

# Plot mutation patterns at G4s
snvs.G4.dist.plot <- snvs.G4.dist.df %>% filter(G4.snv.dist >= -50 & G4.snv.dist <= 100) %>% ggplot(aes(x=G4.snv.dist)) +
  geom_histogram(binwidth=1, fill="#D56014", color="#010101", linewidth = 0.25) + xlab("Position from G4 (nt)") + ylab("snvs count") + xlim(-50,50) +
  theme_bw() + theme(aspect.ratio=0.5, panel.grid.minor = element_line(linetype = "dotted"))
snvs.G4.dist.plot

# Plot G statistics at phases G4s
G4.consMat.plot <- G4.consMat.df %>% ggplot(aes(x=dist, y=G.freq)) +
  geom_line(color = "#4F4A75") + xlab("Position from G4 (nt)") + ylab("G frequency") + xlim(-50,50) +
  theme_bw() + theme(aspect.ratio=0.5, panel.grid.minor = element_line(linetype = "dotted"))
G4.consMat.plot

# Save plots
pdf("./Rplots/G4.mut.density.pdf", width=8, height=6, useDingbats=FALSE)
ggarrange(snvs.G4.dist.plot, G4.consMat.plot, nrow = 2)
dev.off()
## quartz_off_screen 
##                 2

4.10 Polymerase usage at origins

To absence of strand bias for mutagenesis at G4s in the vicinity of origins suggests that these sites are being replicated by the same polymerase.

Fit COSMIC signatures associated with deficient pol epsilon and delta activity. From Robinson et al. Nature Genetics 2021, 53, 1434–1442 (https://www.nature.com/articles/s41588-021-00930-y).

Download individual signatures SBS10a,b (POLE) and SBS10c,d(POLD) from the COSMIC signature websites: https://cancer.sanger.ac.uk/signatures/sbs/ and fit their contribution to constitutive origins.

# r
SBS10a <- read.csv("./Dataset/COSMIC/SBS/v3.2_SBS10a_PROFILE.txt", sep = "\t") %>% dplyr::select(tri = X, SBS10a = SBS10a_GRCh37)
SBS10b <- read.csv("./Dataset/COSMIC/SBS/v3.2_SBS10b_PROFILE.txt", sep = "\t") %>% dplyr::select(tri = X, SBS10b = SBS10b_GRCh37)
SBS10c <- read.csv("./Dataset/COSMIC/SBS/v3.2_SBS10c_PROFILE_cxbZBIy.txt", sep = "\t") %>% dplyr::select(tri = X, SBS10c = SBS10c_GRCh37)
SBS10d <- read.csv("./Dataset/COSMIC/SBS/v3.2_SBS10d_PROFILE_IJAo3WV.txt", sep = "\t") %>% dplyr::select(tri = X, SBS10d = SBS10d_GRCh37)
SBS.pol <- SBS10a %>% full_join(SBS10b) %>% full_join(SBS10c) %>% full_join(SBS10d) %>% dplyr::select(-tri) %>% as.matrix()

cluster.2.Pol.fit <- fit_to_signatures_strict(cluster.2.mut.mat, SBS.pol, max_delta = 0.004)
cluster.2.Pol.fit.df <- as.data.frame(cluster.2.Pol.fit$fit_res$contribution)
cluster.2.Pol.fit.df <- as.data.frame(t(cluster.2.Pol.fit.df))
cluster.2.Pol.fit.filter.df <- cluster.2.Pol.fit.df %>% dplyr::select_if(~any(. >= 10)) %>% rownames_to_column(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.2.Pol.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.2.Pol.fit.filter.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

cluster.2.Pol.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.Pol.fit.filter.df.rds")

cluster.2.Pol.epsilon.fit.plot <- cluster.2.Pol.fit.filter.df %>% filter(sig == "SBS10b") %>% 
  ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
  scale_color_manual(values = c("#0775B6")) + ylim(-10,40) +
  xlim(-5000,5000) + 
  xlab("Distance from origin (bp)") + ylab("Polymerase signature exposure\n(absolute contribution)") +
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.Pol.epsilon.fit.plot

cluster.2.Pol.delta.fit.plot <- cluster.2.Pol.fit.filter.df %>% filter(sig == "SBS10c") %>% 
  ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
  scale_color_manual(values = c("#D56114")) + ylim(20,70) +
  xlim(-5000,5000) + 
  xlab("Distance from origin (bp)") + ylab("Polymerase signature exposure\n(absolute contribution)") +
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.Pol.delta.fit.plot

pdf("./Rplots/cluster.2.Pol.fit.plot.pdf", width=10, height=6, useDingbats=FALSE)
ggarrange(cluster.2.Pol.epsilon.fit.plot, cluster.2.Pol.delta.fit.plot, nrow = 1, ncol = 2)
dev.off()
## quartz_off_screen 
##                 2

5 AID hyperactivity at origins in malignant lymphomas

Cluster 4 of mutational signatures suggest differential AID activity at replication origins compared to background.

In this section, we aim to characterise AID activity at origins and correlation with local variation in mutation burden.

5.1 Mutations at canonical AID motifs

AID activity is assessed by tracking C to G/T mutations within AID specific WRCY motifs, where W = A or T, R = purine and Y = pyrimidine (see Hernández-Verdin et al. npj Precision Oncology, 2022, 6, 89; Kasar et al. Nature commun. 2015, 6, 8866 for examples).

Compute mutational profiles at origins and origin flanks in larger contexts from genomes of cluster 4.

# r

# Load gr lists
snvs.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
snvs.flanks.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")

# Select cluster 4 genomes

clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
cluster.4.id <- (clusters.df %>% filter(cluster == 4))$donor.id # 23 genomes

snvs.ori.cluster.4.grl <- snvs.ori.grl[names(snvs.ori.grl) %in% cluster.4.id]
snvs.flanks.cluster.4.grl <- snvs.flanks.ori.grl[names(snvs.flanks.ori.grl) %in% cluster.4.id]

# Extract contexts

# Load genome
ref.genome <- BSgenome.Hsapiens.UCSC.hg19

snvs.ori.cluster.4.mut.mat.ext.context <- mut_matrix(unlist(snvs.ori.cluster.4.grl), ref_genome = ref.genome, extension = 2)
colnames(snvs.ori.cluster.4.mut.mat.ext.context) <- c("ori")
snvs.flanks.cluster.4.mut.mat.ext.context <- mut_matrix(unlist(snvs.flanks.cluster.4.grl), ref_genome = ref.genome, extension = 2)
colnames(snvs.flanks.cluster.4.mut.mat.ext.context) <- c("flanks")
snvs.cluster.4.mut.mat.ext.context <- cbind(snvs.ori.cluster.4.mut.mat.ext.context, snvs.flanks.cluster.4.mut.mat.ext.context)
saveRDS(snvs.cluster.4.mut.mat.ext.context, file = "./002_Mut_signatures_Pan_cancer/sig/snvs.cluster.4.mut.mat.ext.context.rds")

Plot mutation extended profiles

library(dplyr)
library(ggplot2)
library(MutationalPatterns)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load mutation matrix
snvs.cluster.4.mut.mat.ext.context <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/snvs.cluster.4.mut.mat.ext.context.rds")

ext.profile.plot <- plot_profile_heatmap(snvs.cluster.4.mut.mat.ext.context, by = c("ori", "flanks")) + theme(aspect.ratio=0.7)
river.plot <- plot_river(snvs.cluster.4.mut.mat.ext.context) + theme(aspect.ratio=0.4)
ext.profile.plot

river.plot

pdf("./Rplots/cluster.4.extended.profile.pdf", width=15, height=12, useDingbats=FALSE)
ggarrange(ext.profile.plot, river.plot, nrow = 2)
dev.off()
## quartz_off_screen 
##                 2

Extract motif sequences for DNA logos

# r

# Prepare df
snvs.cluster.logo.df <- as.data.frame(snvs.cluster.4.mut.mat.ext.context)
context <- data.frame(do.call(rbind, strsplit(as.character(row.names(snvs.cluster.logo.df)), ""))) %>% dplyr::select(-X3, -X5, -X7)
colnames(context) <- c("u2", "u1", "REF", "ALT", "d1", "d2")
snvs.cluster.logo.df <- cbind.data.frame(snvs.cluster.logo.df, context)

# Prepare matrices for logos

# At origins
ori.logo.mat.u2 <- as.data.frame(table(rep(snvs.cluster.logo.df$u2, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, u2 = Freq)
ori.logo.mat.u1 <- as.data.frame(table(rep(snvs.cluster.logo.df$u1, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, u1 = Freq)
ori.logo.mat.REF <- as.data.frame(table(rep(snvs.cluster.logo.df$REF, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, REF = Freq)
ori.logo.mat.d1 <- as.data.frame(table(rep(snvs.cluster.logo.df$d1, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, d1 = Freq)
ori.logo.mat.d2 <- as.data.frame(table(rep(snvs.cluster.logo.df$d2, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, d2 = Freq)
ori.logo.mat <- ori.logo.mat.u2 %>% left_join(ori.logo.mat.u1) %>% left_join(ori.logo.mat.REF) %>% left_join(ori.logo.mat.d1) %>% left_join(ori.logo.mat.d2) %>% dplyr::select(-seq)
row.names(ori.logo.mat) <- c("A", "C", "G", "T")
saveRDS(ori.logo.mat, file = "./002_Mut_signatures_Pan_cancer/sig/ori.logo.mat.rds")

# At origin flanks
flanks.logo.mat.u2 <- as.data.frame(table(rep(snvs.cluster.logo.df$u2, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, u2 = Freq)
flanks.logo.mat.u1 <- as.data.frame(table(rep(snvs.cluster.logo.df$u1, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, u1 = Freq)
flanks.logo.mat.REF <- as.data.frame(table(rep(snvs.cluster.logo.df$REF, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, REF = Freq)
flanks.logo.mat.d1 <- as.data.frame(table(rep(snvs.cluster.logo.df$d1, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, d1 = Freq)
flanks.logo.mat.d2 <- as.data.frame(table(rep(snvs.cluster.logo.df$d2, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, d2 = Freq)
flanks.logo.mat <- flanks.logo.mat.u2 %>% left_join(flanks.logo.mat.u1) %>% left_join(flanks.logo.mat.REF) %>% left_join(flanks.logo.mat.d1) %>% left_join(flanks.logo.mat.d2) %>% dplyr::select(-seq)
row.names(flanks.logo.mat) <- c("A", "C", "G", "T")
saveRDS(flanks.logo.mat, file = "./002_Mut_signatures_Pan_cancer/sig/flanks.logo.mat.rds")

Plot DNA logos

library(dplyr)
library(ggplot2)
library(ggseqlogo)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load data
ori.logo.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/ori.logo.mat.rds")
flanks.logo.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/flanks.logo.mat.rds")

logo.col <- make_col_scheme(chars=c('A', 'T', 'C', 'G'),cols=c('#ADCC55', '#DF1F18', '#33BAED', '#E6A004'))
logo.ori <- ggplot() + geom_logo(as.matrix(ori.logo.mat), col_scheme=logo.col, method = "probability") + theme_logo() + theme(aspect.ratio=0.4) + ggtitle("origins")
logo.flanks <- ggplot() + geom_logo(as.matrix(flanks.logo.mat), col_scheme=logo.col, method = "probability") + theme_logo() + theme(aspect.ratio=0.4) + ggtitle("flanks")

pdf("./Rplots/cluster.4.DNA.logo.pdf", width=5, height=5, useDingbats=FALSE)
ggarrange(logo.ori, logo.flanks, nrow = 2)
dev.off()
## quartz_off_screen 
##                 2

Compute the ratio of mutations occurring at AID canonical WRCY motifs over total mutations in bins of 100bp around origins

# r

# Load grange lists
cluster.4.snvs.bin.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.4.snvs.grl.rds")

# Extract contexts
snvs.ori.cluster.4.mut.mat.bin.ext.context <- mut_matrix(cluster.4.snvs.bin.grl, ref_genome = ref.genome, extension = 2)

# Define AID canonical WR[C>T]YN and WR[C>G]YN motifs (W = A or T, R = purine and Y = pyrimidine)
AID.can.motifs <- expand.grid(c("A", "T"), c("A", "G"), c("[C>T]", "[C>G]"), c("C", "T"), c("A", "C", "G", "T")) %>% mutate(AID.context = paste0(Var1, Var2, Var3, Var4, Var5))
length(AID.can.motifs$AID.context) # 64 motifs

# Compute ratio
cluster.4.AID.ratio.df <- tibble()
for (i in 1:ncol(snvs.ori.cluster.4.mut.mat.bin.ext.context)) {
  print(i/ncol(snvs.ori.cluster.4.mut.mat.bin.ext.context))
  dist.i <- as.numeric(colnames(snvs.ori.cluster.4.mut.mat.bin.ext.context)[i])
  mut.df.i <- cbind.data.frame(context = rownames(snvs.ori.cluster.4.mut.mat.bin.ext.context), value = as.vector(snvs.ori.cluster.4.mut.mat.bin.ext.context[,i]))
    count.AID.cont.i <- mut.df.i %>% filter(context %in% AID.can.motifs$AID.context)
    count.non.AID.cont.i <- mut.df.i %>% filter(!context %in% AID.can.motifs$AID.context)
    ratio.AID.i <- sum(count.AID.cont.i$value) / (sum(count.AID.cont.i$value) + sum(count.non.AID.cont.i$value))
  ratio.df.i <- cbind.data.frame(dist = dist.i, ratio.AID = ratio.AID.i)
  cluster.4.AID.ratio.df <- rbind(cluster.4.AID.ratio.df, ratio.df.i)
}
saveRDS(cluster.4.AID.ratio.df, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.AID.ratio.df.rds")

# As a control, compute the ratio of Cs in a AID canonical motifs compared to the total number of C in 100bp bins around origins

# Load domain/bin coordinates
ori.10kb.domain.100nt.split.gr <- import("./Dataset/ori/ori.10kb.domain.100nt.split.hg19.NCBI.bed")
my.chr <- c(1:22, "X", "Y")
ori.10kb.domain.100nt.split.gr <- ori.10kb.domain.100nt.split.gr[seqnames(ori.10kb.domain.100nt.split.gr) %in% my.chr]
seqlevelsStyle(ori.10kb.domain.100nt.split.gr) <- "UCSC"

# Recover sequences and compute base composition statistics
ori.views <- Views(Hsapiens, ori.10kb.domain.100nt.split.gr)
ori.seq <- DNAStringSet(ori.views)

stat.AID.seq.df <- tibble()
for (i in 1:length(ori.seq)) {
  print(i/length(ori.seq))
  seq.i <- as.character(ori.seq[i])
      C.count.i <- str_count(seq.i, pattern = "C")
      G.count.i <- str_count(seq.i, pattern = "G")
      AID.mot.plus.count.i <- str_count(seq.i, pattern = "[A|T][A|G]C[C|T]")
      AID.mot.minus.count.i <- str_count(seq.i, pattern = "[G|A]G[T|C][T|A]")
      df.i <- cbind.data.frame(C.count = C.count.i, G.count = G.count.i, AID.mot.plus.count = AID.mot.plus.count.i, AID.mot.minus.count = AID.mot.minus.count.i)
  stat.AID.seq.df <- rbind(stat.AID.seq.df, df.i)
}
stat.AID.seq.df$bin <- ori.10kb.domain.100nt.split.gr$name
saveRDS(stat.AID.seq.df, "./002_Mut_signatures_Pan_cancer/rds/stat.AID.seq.df.rds")
stat.AID.seq.summary.df <- stat.AID.seq.df %>% group_by(bin) %>% summarise(C.count = sum(C.count), G.count = sum(G.count), AID.mot.plus.count = sum(AID.mot.plus.count), AID.mot.minus.count = sum(AID.mot.minus.count))
saveRDS(stat.AID.seq.summary.df, "./002_Mut_signatures_Pan_cancer/rds/stat.AID.seq.summary.df.rds")

Plot results

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load results
cluster.4.AID.ratio.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.AID.ratio.df.rds")
stat.AID.seq.summary.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/stat.AID.seq.summary.df.rds")

# Format and combine information
cluster.4.AID.ratio.df <- cluster.4.AID.ratio.df %>% dplyr::rename(value = ratio.AID) %>% mutate(data = "snvs")
stat.AID.seq.summary.2.df <- stat.AID.seq.summary.df %>% mutate(dist = (as.numeric(bin)-100)*100, value = (AID.mot.plus.count+AID.mot.minus.count)/(C.count+G.count)) %>% dplyr::select(dist, value) %>% mutate(data = "motif")
cluster.4.AID.ratio.summary.df <- rbind(cluster.4.AID.ratio.df, stat.AID.seq.summary.2.df)

cluster.4.AID.ratio.plot <- cluster.4.AID.ratio.summary.df %>% ggplot(aes(x = dist, y = value, color = data)) +
  geom_line() + xlim(-5000, 5000) +
  scale_color_manual(values = c("black", "#E7A106")) + ylim(0,0.25) +
  xlab("Distance from origin (bp)") + ylab("Fraction at cAID motif") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.4.AID.ratio.plot

pdf("./Rplots/cluster.4.AID.ratio.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.4.AID.ratio.plot
dev.off()
## quartz_off_screen 
##                 2

5.2 AID gene expression

Assess correlation between AID expression and mutagenesis at origins.

Gene expression data (in TPM) is formatted as described in /cephfs2/pmurat/OriCan/003_Transcriptomics_cluster_analysis_v3_1.Rmd

This is a pan cancer analysis as there is no transcriptomic data for CMDI.

# r

# Load transcriptomics data
ICGC.transc.TPM.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")

# Filter AID gene
AID.TPM.df <- ICGC.transc.TPM.df %>% filter(gene.id == "AICDA")

# Compute mean snvs density ratios in bin of AID expression
AID.TPM.summary.df <- AID.TPM.df %>% ungroup() %>% filter(TPM > 0) %>% mutate(bin = ntile(TPM, 10)) %>% group_by(bin) %>% 
  summarise(AID.TPM = mean(TPM, na.rm = T), snvs.dens.ratio.mean = mean(snvs.dens.ratio, na.rm = T), snvs.dens.ratio.sem = std.error(snvs.dens.ratio, na.rm = T))
saveRDS(AID.TPM.summary.df, "./002_Mut_signatures_Pan_cancer/rds/AID.TPM.summary.df.rds")

# Assess correlation
cor.test(AID.TPM.df$TPM, AID.TPM.df$snvs.dens.ratio, na.rm = T) # Rho = 0.2551857, p-value = 6.973e-12

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

AID.TPM.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/AID.TPM.summary.df.rds")
AID.TPM.summary.plot <- AID.TPM.summary.df %>%
  ggplot(aes(x=AID.TPM, y=snvs.dens.ratio.mean)) +
  geom_line(color = "#E7A106") +
  geom_errorbar(aes(ymin=snvs.dens.ratio.mean-snvs.dens.ratio.sem, ymax=snvs.dens.ratio.mean+snvs.dens.ratio.sem), width=.2, position=position_dodge(0.05), color = "#E7A106") +
  geom_point(shape = 21, size = 2, fill = "white", color = "#E7A106") +
  scale_x_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
  ylab("Mutation density ratio (ori/flanks)") + xlab("AICDA expression bin (log10 TPM)") + ggtitle("Rho = 0.255, p-value = 6.973e-12") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
AID.TPM.summary.plot

pdf("./Rplots/AID.TPM.summary.plot.pdf", width=7, height=6, useDingbats=FALSE)
AID.TPM.summary.plot
dev.off()
## quartz_off_screen 
##                 2

5.3 APOBEC contribution to origin mutagenesis

Assess the contribution of other cytidine deaminases to origin mutagenesis.

We focus herein on the family of APOBEC3 and compare snvs density ratio for MALY donors binned by APOBEC level of expression.

# r

# Load transcriptomics data
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")

# Select gene of interest (APOBEC3s and AID as positive control)
GOI <- c("AICDA", "APOBEC3A", "APOBEC3B", "APOBEC3C", "APOBEC3D", "APOBEC3F", "APOBEC3G", "APOBEC3H")
# Bin donors for high or low level expression levels for each GOI
APOBEC.df <- ICGC.transc.TPM.df %>% ungroup() %>% filter(gene.id %in% GOI) %>% group_by(gene.id) %>% filter(TPM >= 0.01) %>% mutate(TPM.bin = ntile(TPM, 3)) %>%
  filter(TPM.bin != 2) %>% mutate(EXP = case_when(TPM.bin == 1 ~ "low", T ~ "high"))
saveRDS(APOBEC.df, "./002_Mut_signatures_Pan_cancer/rds/APOBEC.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

APOBEC.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/APOBEC.df.rds")
APOBEC.plot <- APOBEC.df %>%
  ggplot(aes(x=gene.id, y=snvs.dens.ratio, fill = EXP)) +
  geom_boxplot(alpha = 0.7) + ylab("Mutation density ratio (ori/flanks)") +
  scale_fill_manual(values = c("#E7A208", "#D4D2D2")) +
  geom_hline(yintercept=1, linetype="dashed") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"), axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
APOBEC.plot

# Compute stats
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "AICDA" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "AICDA" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  APOBEC.df[which(APOBEC.df$gene.id == "AICDA" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "AICDA" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.5, p-value < 2.2e-16
## alternative hypothesis: the CDF of x lies below that of y
# p-value < 2.2e-16
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3A" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3A" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3A" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3A" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.095941, p-value = 0.0952
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 0.0952
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3B" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3B" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3B" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3B" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.3276, p-value = 4.696e-13
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 4.697e-13
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3C" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3C" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3C" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3C" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.46067, p-value < 2.2e-16
## alternative hypothesis: the CDF of x lies below that of y
# p-value < 2.2e-16
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3D" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3D" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3D" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3D" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.35793, p-value = 8.882e-16
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 8.346e-16
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3F" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3F" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3F" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3F" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.23985, p-value = 1.695e-07
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 1.695e-07
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3G" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3G" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3G" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3G" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.38282, p-value < 2.2e-16
## alternative hypothesis: the CDF of x lies below that of y
# p-value < 2.2e-16
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3H" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3H" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3H" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3H" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.090022, p-value = 0.1241
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 0.1241

# Save plot
pdf("./Rplots/APOBEC.plot.pdf", width=7, height=6, useDingbats=FALSE)
APOBEC.plot
dev.off()
## quartz_off_screen 
##                 2

This analysis suggests that APOBEC3s may contribute to mutagenesis at origins.

We now assess signs of APOBEC3s activity at origins through mutational signature analysis.

# R

# Identify APOBEC signatures at origins in MALY genomes

# Load mutational differential matrix
snvs.diff.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")

# Recover MALY donors and associated matrices
MALY.donor.id <- MALY.snvs.dens.ratio.df %>% filter(donor %in% colnames(snvs.diff.mut.mat))
MALY.mut.mat <- snvs.diff.mut.mat %>% as.data.frame() %>% dplyr::select(all_of(MALY.donor.id$donor)) # 33 genomes

# Recover known signatures (COSMIC signatures)
COSMIC.signatures <- get_known_signatures()

# Select signatures relevant to malignant lymphomas
MALY.select <- c("SBS1", "SBS2", "SBS3", "SBS5", "SBS6", "SBS9", "SBS10a", "SBS10b", "SBS13", "SBS14", "SBS15", "SBS18", "SBS20", "SBS21", "SBS26", "SBS30", "SBS36", "SBS44", "SBS84", "SBS85")
MALY.signatures <- COSMIC.signatures %>% as.data.frame() %>% dplyr::select(all_of(MALY.select)) %>% as.matrix()

# Fit COSMIC signatures to MALY genomes
MALY.fit.res <- fit_to_signatures_strict(MALY.mut.mat, MALY.signatures, max_delta = 0.004)

# Merge signatures that are non-related to deaminases
DEAMIN.signatures <- COSMIC.signatures %>% as.data.frame() %>% dplyr::select("SBS2", "SBS13", "SBS84", "SBS85") %>% as.matrix()
non.DEAMIN.signatures <- COSMIC.signatures %>% as.data.frame() %>% dplyr::select(-colnames(DEAMIN.signatures)) %>% as.matrix()

DEAMIN.fit.res <- MALY.fit.res$fit_res$contribution %>% as.data.frame() %>% mutate_all(~ ./sum(.))
DEAMIN.contr.df <- DEAMIN.fit.res[which(row.names(DEAMIN.fit.res) %in% colnames(DEAMIN.signatures)),]
no.DEAMIN.contr.df <- colSums(DEAMIN.fit.res[which(row.names(DEAMIN.fit.res) %in% colnames(non.DEAMIN.signatures)),])
DEAMIN.contribution <- rbind(DEAMIN.contr.df, no.DEAMIN.contr.df) %>% as.matrix()
row.names(DEAMIN.contribution) <- c("SBS2", "SBS13", "SBS84", "SBS85", "others")

# Melt df for plotting
DEAMIN.contribution.melt.df <- DEAMIN.contribution %>% as.data.frame() %>% rownames_to_column("SBS") %>% gather("donor", "value", -SBS)
saveRDS(DEAMIN.contribution.melt.df, "./002_Mut_signatures_Pan_cancer/rds/DEAMIN.contribution.melt.df.rds")

Plot contribution

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

DEAMIN.contribution.melt.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/DEAMIN.contribution.melt.df.rds")
plot.order <- DEAMIN.contribution.melt.df %>% filter(SBS == "others") %>% arrange(-dplyr::desc(value)) %>% mutate(donor = factor(donor))
DEAMIN.contribution.melt.plot <- DEAMIN.contribution.melt.df %>% mutate(donor = factor(donor, levels = plot.order$donor, ordered = TRUE)) %>% 
  ggplot(aes(fill=SBS, y=value, x=donor)) + 
  geom_bar(position="stack", stat="identity") + ylab("Relative contribution") +
  scale_fill_manual(values = c("#D4D2D2", "#33BAED", "#4F4A75", "#E6A004", "#E41F1A")) +
  theme_bw() + theme(aspect.ratio=0.7, axis.title.x=element_blank(), panel.grid.major = element_blank(), axis.text.x=element_text(angle=45,hjust=1))
DEAMIN.contribution.melt.plot

# Save plot
pdf("./Rplots/DEAMIN.contribution.melt.plot.pdf", width=7, height=6, useDingbats=FALSE)
DEAMIN.contribution.melt.plot
dev.off()
## quartz_off_screen 
##                 2